In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_digits, fetch_california_housing

# Load the digits dataset into a DataFrame: one `pixel_<i>` column per
# feature column of `digits.data`, plus the class label in `target`.
digits = load_digits()
pixel_columns = [f'pixel_{i}' for i in range(digits.data.shape[1])]
digits_df = pd.DataFrame(data=digits.data, columns=pixel_columns)
digits_df['target'] = digits.target


# Load the California housing dataset; the regression target is stored
# next to the features under the column name `MedHouseVal`.
housing = fetch_california_housing()
housing_df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
housing_df['MedHouseVal'] = housing.target

# EDA for California housing dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() emits a stray "None" line.
housing_df.info()
print("\n")
print(housing_df.head())
print("\n")
print(housing_df.describe())
print("\n")
sns.histplot(housing_df['MedHouseVal'], bins=30)
plt.title('Distribution of Median House Values')
plt.show()

# EDA for digits dataset
# Same as above: call info() directly to avoid printing its None return value.
digits_df.info()
print("\n")
print(digits_df.describe())
print("\n")
sns.countplot(x='target', data=digits_df)
plt.title('Distribution of Digits')
plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  
count  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558  
std       10.386050      2.135952      2.003532      1.153956  
min        0.692308     32.540000   -124.350000      0.149990  
25%        2.429741     33.930000   -121.800000      1.196000  
50%        2.818116     34.260000   -118.490000      1.797000  
75%        3.282261     37.710000   -118.010000      2.647250  
max     1243.333333     41.950000   -114.310000      5.000010  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 65 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pixel_0   1797 non-null   float64
 1   pixel_1   1797 non-null   float64
 2   pixel_2   1797 non-null   float64
 3   pixel_3   1797 non-null   float64
 4   pixel_4   1797 non-null   float64
 5   pixel_5   1797 non-null   float64
 6   pixel_6   1797 non-null   float64
 7   pixel_7   1797 non-null   float64
 8   pixel_8   1797 non-null   float64
 9   pixel_9   1797 non-null   float64
 10  pixel_10  1797 non-null   float64
 11  pixel_11  1797 non-null   float64
 12  pixel_12  1797 non-null   float64
 13  pixel_13  1797 non-null   float64
 14  pixel_14  1797 non-null   float64
 15  pixel_15  1797 non-null   float64
 16  pixel_16  1797 non-null   float64
 17  pixel_17  1797 non-null   float64
 18  pixel_18  1797 non-null   float64
 19  pixel_19  1797 non-null   float64
 20  pixel_20  1797 non-null   float64
 21  pixel_21  1797 non-null   float64
 22  pixel_22  1797 non-null   float64
 23  pixel_23  1797 non-null   float64
 24  pixel_24  1797 non-null   float64
 25  pixel_25  1797 non-null   float64
 26  pixel_26  1797 non-null   float64
 27  pixel_27  1797 non-null   float64
 28  pixel_28  1797 non-null   float64
 29  pixel_29  1797 non-null   float64
 30  pixel_30  1797 non-null   float64
 31  pixel_31  1797 non-null   float64
 32  pixel_32  1797 non-null   float64
 33  pixel_33  1797 non-null   float64
 34  pixel_34  1797 non-null   float64
 35  pixel_35  1797 non-null   float64
 36  pixel_36  1797 non-null   float64
 37  pixel_37  1797 non-null   float64
 38  pixel_38  1797 non-null   float64
 39  pixel_39  1797 non-null   float64
 40  pixel_40  1797 non-null   float64
 41  pixel_41  1797 non-null   float64
 42  pixel_42  1797 non-null   float64
 43  pixel_43  1797 non-null   float64
 44  pixel_44  1797 non-null   float64
 45  pixel_45  1797 non-null   float64
 46  pixel_46  1797 non-null   float64
 47  pixel_47  1797 non-null   float64
 48  pixel_48  1797 non-null   float64
 49  pixel_49  1797 non-null   float64
 50  pixel_50  1797 non-null   float64
 51  pixel_51  1797 non-null   float64
 52  pixel_52  1797 non-null   float64
 53  pixel_53  1797 non-null   float64
 54  pixel_54  1797 non-null   float64
 55  pixel_55  1797 non-null   float64
 56  pixel_56  1797 non-null   float64
 57  pixel_57  1797 non-null   float64
 58  pixel_58  1797 non-null   float64
 59  pixel_59  1797 non-null   float64
 60  pixel_60  1797 non-null   float64
 61  pixel_61  1797 non-null   float64
 62  pixel_62  1797 non-null   float64
 63  pixel_63  1797 non-null   float64
 64  target    1797 non-null   int64  
dtypes: float64(64), int64(1)
memory usage: 912.7 KB
None


       pixel_0      pixel_1      pixel_2      pixel_3      pixel_4  \
count   1797.0  1797.000000  1797.000000  1797.000000  1797.000000   
mean       0.0     0.303840     5.204786    11.835838    11.848080   
std        0.0     0.907192     4.754826     4.248842     4.287388   
min        0.0     0.000000     0.000000     0.000000     0.000000   
25%        0.0     0.000000     1.000000    10.000000    10.000000   
50%        0.0     0.000000     4.000000    13.000000    13.000000   
75%        0.0     0.000000     9.000000    15.000000    15.000000   
max        0.0     8.000000    16.000000    16.000000    16.000000   

           pixel_5      pixel_6      pixel_7      pixel_8      pixel_9  ...  \
count  1797.000000  1797.000000  1797.000000  1797.000000  1797.000000  ...   
mean      5.781859     1.362270     0.129661     0.005565     1.993879  ...   
std       5.666418     3.325775     1.037383     0.094222     3.196160  ...   
min       0.000000     0.000000     0.000000     0.000000     0.000000  ...   
25%       0.000000     0.000000     0.000000     0.000000     0.000000  ...   
50%       4.000000     0.000000     0.000000     0.000000     0.000000  ...   
75%      11.000000     0.000000     0.000000     0.000000     3.000000  ...   
max      16.000000    16.000000    15.000000     2.000000    16.000000  ...   

          pixel_55     pixel_56     pixel_57     pixel_58     pixel_59  \
count  1797.000000  1797.000000  1797.000000  1797.000000  1797.000000   
mean      0.206455     0.000556     0.279354     5.557596    12.089037   
std       0.984401     0.023590     0.934302     5.103019     4.374694   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     1.000000    11.000000   
50%       0.000000     0.000000     0.000000     4.000000    13.000000   
75%       0.000000     0.000000     0.000000    10.000000    16.000000   
max      13.000000     1.000000     9.000000    16.000000    16.000000   

          pixel_60     pixel_61     pixel_62     pixel_63       target  
count  1797.000000  1797.000000  1797.000000  1797.000000  1797.000000  
mean     11.809126     6.764051     2.067891     0.364496     4.490818  
std       4.933947     5.900623     4.090548     1.860122     2.865304  
min       0.000000     0.000000     0.000000     0.000000     0.000000  
25%      10.000000     0.000000     0.000000     0.000000     2.000000  
50%      14.000000     6.000000     0.000000     0.000000     4.000000  
75%      16.000000    12.000000     2.000000     0.000000     7.000000  
max      16.000000    16.000000    16.000000    16.000000     9.000000  

[8 rows x 65 columns]


In [23]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target

# Split the dataset into training and testing sets (20% held out for testing;
# the fixed random_state keeps the split identical across runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree Classifier
# random_state is pinned so the fitted tree -- and therefore the accuracy and
# confusion matrix reported below -- is the same on every run.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)

# Train Random Forest Classifier
# The forest bootstraps samples and subsamples features, so it is seeded too.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Calculate accuracy on the held-out test set
dt_accuracy = accuracy_score(y_test, y_pred_dt)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

# Print accuracy scores
print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("\n")

# Plot confusion matrices side by side, one axis per model
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Decision Tree Confusion Matrix
dt_cm = confusion_matrix(y_test, y_pred_dt)
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Decision Tree Confusion Matrix')

# Random Forest Confusion Matrix
rf_cm = confusion_matrix(y_test, y_pred_rf)
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Random Forest Confusion Matrix')

plt.show()
Decision Tree Accuracy: 85.00%
Random Forest Accuracy: 97.50%


In [24]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target

# Split the dataset into training and testing sets (20% held out for testing;
# the fixed random_state keeps the split identical across runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tuning Decision Tree
# Exhaustive search over the grid below with 5-fold cross-validation on the
# training set. The base estimator is seeded so that the cross-validation
# scores, and hence the selected parameters, are reproducible.
dt_param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)
# predict() on the fitted search delegates to the best estimator found.
y_pred_dt_tuned = dt_grid_search.predict(X_test)

# Tuning Random Forest
# Same procedure; the forest is seeded for the same reproducibility reason.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20]
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
y_pred_rf_tuned = rf_grid_search.predict(X_test)

# Calculate accuracy for tuned models
dt_tuned_accuracy = accuracy_score(y_test, y_pred_dt_tuned)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)

# Compute each confusion matrix once; it is both printed and plotted below.
dt_cm = confusion_matrix(y_test, y_pred_dt_tuned)
rf_cm = confusion_matrix(y_test, y_pred_rf_tuned)

# Print accuracy scores
print(f"Tuned Decision Tree Accuracy: {dt_tuned_accuracy * 100:.2f}%")
print(f"Tuned Random Forest Accuracy: {rf_tuned_accuracy * 100:.2f}%")
print("\n")
# Print Confusion Matrices
print("Tuned Decision Tree Confusion Matrix:")
print(dt_cm)
print("\n")
print("Tuned Random Forest Confusion Matrix:")
print(rf_cm)

# Plot confusion matrices side by side, one axis per model
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Decision Tree Confusion Matrix
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Tuned Decision Tree Confusion Matrix')

# Random Forest Confusion Matrix
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Tuned Random Forest Confusion Matrix')

plt.show()
Tuned Decision Tree Accuracy: 86.39%
Tuned Random Forest Accuracy: 97.50%


Tuned Decision Tree Confusion Matrix:
[[29  0  0  0  2  1  0  0  0  1]
 [ 0 21  1  0  3  0  0  1  1  1]
 [ 0  1 26  2  2  0  0  1  1  0]
 [ 0  0  0 30  0  0  1  0  1  2]
 [ 0  2  0  0 41  0  0  3  0  0]
 [ 0  0  0  1  1 45  0  0  0  0]
 [ 0  0  0  0  1  0 34  0  0  0]
 [ 0  0  0  2  2  0  0 30  0  0]
 [ 1  3  0  1  2  1  0  0 20  2]
 [ 0  0  0  3  0  0  0  2  0 35]]


Tuned Random Forest Confusion Matrix:
[[32  0  0  0  1  0  0  0  0  0]
 [ 0 28  0  0  0  0  0  0  0  0]
 [ 0  0 33  0  0  0  0  0  0  0]
 [ 0  0  0 33  0  1  0  0  0  0]
 [ 0  0  0  0 46  0  0  0  0  0]
 [ 0  0  0  0  0 45  1  0  0  1]
 [ 0  0  0  0  0  1 34  0  0  0]
 [ 0  0  0  0  0  0  0 33  0  1]
 [ 0  1  0  0  0  0  0  0 29  0]
 [ 0  0  0  0  0  1  0  1  0 38]]
In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# train_test_split is imported here as well so this cell does not depend on
# another cell's import having been executed first.

# Split the dataset
# `housing` is the object returned by fetch_california_housing() in the EDA
# cell. 20% of the rows are held out for testing with a fixed seed.
X_housing = housing.data
y_housing = housing.target
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(X_housing, y_housing, test_size=0.2, random_state=42)

# Decision Tree Regressor
# random_state is pinned so the metrics printed below are reproducible.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_housing, y_train_housing)
y_pred_dt_reg = dt_regressor.predict(X_test_housing)

# Random Forest Regressor
# Seeded for the same reason: bootstrap sampling is otherwise random per run.
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_housing, y_train_housing)
y_pred_rf_reg = rf_regressor.predict(X_test_housing)

# Metrics on the held-out test set: mean squared error and R^2 for each model
print("Decision Tree Regression MSE:", mean_squared_error(y_test_housing, y_pred_dt_reg))
print("Decision Tree R^2:", r2_score(y_test_housing, y_pred_dt_reg))
print("Random Forest Regression MSE:", mean_squared_error(y_test_housing, y_pred_rf_reg))
print("Random Forest R^2:", r2_score(y_test_housing, y_pred_rf_reg))
Decision Tree Regression MSE: 0.4950806416996609
Decision Tree R^2: 0.6221937960435375
Random Forest Regression MSE: 0.2540368314402719
Random Forest R^2: 0.8061392773870548
In [25]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target

# Split the dataset into training and testing sets (20% held out for testing;
# the fixed random_state keeps the split identical across runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tuning Decision Tree using RandomizedSearchCV
# Samples n_iter=3 parameter combinations from the lists below and scores each
# with 3-fold cross-validation. The search's random_state only fixes which
# combinations are sampled, so the estimator itself is seeded as well to make
# the fitted trees reproducible.
dt_param_dist = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}
dt_random_search = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), dt_param_dist, n_iter=3, cv=3, random_state=42)
dt_random_search.fit(X_train, y_train)
# predict() on the fitted search delegates to the best estimator found.
y_pred_dt_tuned = dt_random_search.predict(X_test)

# Tuning Random Forest using RandomizedSearchCV
# Same procedure; the forest is seeded for the same reproducibility reason.
rf_param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [None, 5, 10]
}
rf_random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_param_dist, n_iter=3, cv=3, random_state=42)
rf_random_search.fit(X_train, y_train)
y_pred_rf_tuned = rf_random_search.predict(X_test)

# Calculate accuracy for tuned models
dt_tuned_accuracy = accuracy_score(y_test, y_pred_dt_tuned)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)

# Compute each confusion matrix once; it is both printed and plotted below.
dt_cm = confusion_matrix(y_test, y_pred_dt_tuned)
rf_cm = confusion_matrix(y_test, y_pred_rf_tuned)

# Print accuracy scores
print(f"Tuned Decision Tree Accuracy: {dt_tuned_accuracy * 100:.2f}%")
print(f"Tuned Random Forest Accuracy: {rf_tuned_accuracy * 100:.2f}%")

# Print Confusion Matrices
print("Tuned Decision Tree Confusion Matrix:")
print(dt_cm)
print("Tuned Random Forest Confusion Matrix:")
print(rf_cm)

# Plot confusion matrices side by side, one axis per model
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Decision Tree Confusion Matrix
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Tuned Decision Tree Confusion Matrix')

# Random Forest Confusion Matrix
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Tuned Random Forest Confusion Matrix')

plt.show()
Tuned Decision Tree Accuracy: 83.89%
Tuned Random Forest Accuracy: 98.33%
Tuned Decision Tree Confusion Matrix:
[[29  0  1  0  1  1  0  0  0  1]
 [ 0 22  1  0  1  0  0  1  2  1]
 [ 1  0 24  3  2  0  1  1  1  0]
 [ 0  0  1 30  1  0  0  0  1  1]
 [ 0  0  1  0 40  1  2  1  1  0]
 [ 0  0  4  0  1 40  1  0  1  0]
 [ 0  0  0  0  3  0 32  0  0  0]
 [ 0  0  0  1  2  0  0 30  0  1]
 [ 0  2  1  2  0  1  0  0 21  3]
 [ 0  0  0  2  1  0  0  2  1 34]]
Tuned Random Forest Confusion Matrix:
[[32  0  0  0  1  0  0  0  0  0]
 [ 0 28  0  0  0  0  0  0  0  0]
 [ 0  0 33  0  0  0  0  0  0  0]
 [ 0  0  0 33  0  1  0  0  0  0]
 [ 0  0  0  0 46  0  0  0  0  0]
 [ 0  0  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  1 34  0  0  0]
 [ 0  0  0  0  0  0  0 33  0  1]
 [ 0  1  0  0  0  0  0  0 29  0]
 [ 0  0  0  0  0  0  0  1  0 39]]
In [28]:
# Report the fitted models' `feature_importances_` arrays: the two digit
# classifiers first, then the two housing regressors. Each entry pairs the
# heading that is printed with the already-fitted model it describes.
importance_reports = [
    # Feature Importance for Classification
    ("Decision Tree Feature Importance:", dt_classifier),
    ("Random Forest Feature Importance:", rf_classifier),
    # Feature Importance for Regression
    ("Decision Tree Regressor Feature Importance:", dt_regressor),
    ("Random Forest Regressor Feature Importance:", rf_regressor),
]

for position, (heading, fitted_model) in enumerate(importance_reports):
    # Blank-line separator between reports, but not before the first one
    # and not after the last one.
    if position > 0:
        print("\n")
    print(heading)
    print(fitted_model.feature_importances_)
Decision Tree Feature Importance:
[0.         0.01036013 0.00451836 0.00569966 0.00243391 0.05963066
 0.         0.         0.         0.00103113 0.03424813 0.00077334
 0.0116301  0.00564541 0.00292152 0.         0.00153196 0.
 0.01859643 0.01277246 0.05068993 0.10083092 0.00123735 0.
 0.00151764 0.00077334 0.07175765 0.06311472 0.00736352 0.00434546
 0.0091637  0.         0.         0.05878518 0.00221174 0.00625672
 0.07837266 0.01672474 0.00365746 0.         0.         0.00269306
 0.13063307 0.04837164 0.00077334 0.         0.01559676 0.
 0.         0.0021267  0.00709405 0.00858535 0.0007347  0.00123735
 0.01724102 0.00152988 0.         0.         0.00588908 0.00077334
 0.06531677 0.03235316 0.00144358 0.00901124]


Random Forest Feature Importance:
[0.00000000e+00 2.70631473e-03 2.14913233e-02 1.05092028e-02
 1.00978014e-02 1.64898689e-02 8.04677766e-03 8.26333795e-04
 6.37017762e-05 1.02515402e-02 2.73231438e-02 5.72023318e-03
 1.54534256e-02 2.61916936e-02 4.56128411e-03 5.37956710e-04
 7.06270618e-05 7.64725651e-03 2.13004239e-02 2.58066462e-02
 3.12641697e-02 4.74594251e-02 8.83202224e-03 6.92691968e-04
 1.45774767e-05 1.55011221e-02 4.10820139e-02 2.76058306e-02
 3.49759665e-02 2.16279796e-02 3.65400138e-02 4.24236622e-05
 0.00000000e+00 3.38336410e-02 2.45128606e-02 2.00785391e-02
 3.60605647e-02 1.89641527e-02 2.30882573e-02 0.00000000e+00
 2.26227020e-05 1.08058930e-02 3.24222740e-02 4.60223811e-02
 2.00973620e-02 2.07301330e-02 1.48531087e-02 8.16271063e-05
 2.87159687e-05 3.05501233e-03 1.62098922e-02 2.02596358e-02
 1.30036464e-02 2.28898713e-02 2.81774465e-02 1.99790301e-03
 0.00000000e+00 2.28124933e-03 2.16805731e-02 9.25950631e-03
 3.22491832e-02 2.59498824e-02 1.63248297e-02 4.32544363e-03]


Decision Tree Regressor Feature Importance:
[0.52747092 0.05184666 0.05279608 0.02815275 0.03082368 0.13166008
 0.09266352 0.08458631]


Random Forest Regressor Feature Importance:
[0.52525632 0.05456008 0.04325463 0.02928013 0.03073374 0.13886681
 0.08872977 0.08931852]