import pandas as pd

df = pd.read_csv('dataset.csv')

df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                        Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                        --------------  ----- 
 0   age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"  41188 non-null  object
dtypes: object(1)
memory usage: 321.9+ KB

with open("dataset.csv", "r", encoding="utf-8") as f:
    lines = f.readlines()

data = [line.strip().replace('"', '').split(';') for line in lines]

df = pd.DataFrame(data[1:], columns=data[0])

df.to_csv("cleaned_dataset.csv", index=False)

df = pd.read_csv('cleaned_dataset.csv')

df.head()

numeric_columns = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

duplicate = df[df.duplicated()]
duplicate

df = df.drop_duplicates()

missing_df = pd.DataFrame(df.isnull().sum())
missing_df

unknown_df = pd.DataFrame((df == 'unknown').sum() / len(df) * 100)
unknown_df

df = df[df['marital'] != 'unknown']
df = df[df['job'] != 'unknown']

df = df.drop('duration', axis=1)

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['age'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['campaign'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['pdays'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['previous'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['emp.var.rate'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['cons.price.idx'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['cons.conf.idx'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['euribor3m'])
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df['nr.employed'])
plt.show()

numeric_columns = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

outlier_percentages = {}

for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    percent_outliers = (len(outliers) / len(df)) * 100
    outlier_percentages[col] = round(percent_outliers, 2)

outlier_df = pd.DataFrame.from_dict(outlier_percentages, orient='index', columns=['Outlier %'])

outlier_df

ohe_cols = ['education', 'default', 'housing', 'loan', 'job', 'marital', 'contact', 'month', 'day_of_week', 'poutcome']

df = pd.get_dummies(df, columns=ohe_cols, drop_first=False)

df['y'] = df['y'].map({'yes': 1, 'no': 0})

df['y'] = df.pop('y')

df['y'] = df['y'].astype(bool)

df_numeric = df[numeric_columns]

corr = df_numeric.corr()
corr.style.background_gradient(cmap='coolwarm')

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = add_constant(df[numeric_columns])  

vif = pd.DataFrame()
vif['feature'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

          feature            VIF
0           const  529178.488786
1             age       1.018425
2        campaign       1.033281
3           pdays       1.608952
4        previous       1.791291
5    emp.var.rate      33.087332
6  cons.price.idx       6.342704
7   cons.conf.idx       2.645638
8       euribor3m      64.279331
9     nr.employed      31.649421

df.drop(['euribor3m', 'nr.employed'], axis=1, inplace=True)

numeric_columns = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx']

X = add_constant(df[numeric_columns])  

vif = pd.DataFrame()
vif['feature'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

          feature           VIF
0           const  71863.225169
1             age      1.018222
2        campaign      1.025667
3           pdays      1.601893
4        previous      1.743881
5    emp.var.rate      3.356323
6  cons.price.idx      2.760853
7   cons.conf.idx      1.127746

import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

X = df.drop('y', axis=1)
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.fit_transform(X_test)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_scaled, y_train)

y_pred = lr.predict(X_test_scaled)
y_prob = lr.predict_proba(X_test_scaled)[:,1]

print("Logistic Regression Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

y_pred_proba = lr.predict_proba(X_test_scaled)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Logistic Regression Report:
              precision    recall  f1-score   support

       False       0.91      0.98      0.94     10832
        True       0.65      0.24      0.35      1401

    accuracy                           0.90     12233
   macro avg       0.78      0.61      0.65     12233
weighted avg       0.88      0.90      0.88     12233

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
y_prob = knn.predict_proba(X_test_scaled)[:,1]

print("K-Nearest Neighbors Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - KNN")
plt.show()

y_pred_proba = knn.predict_proba(X_test_scaled)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

K-Nearest Neighbors Report:
              precision    recall  f1-score   support

       False       0.91      0.97      0.94     10832
        True       0.52      0.24      0.33      1401

    accuracy                           0.89     12233
   macro avg       0.71      0.61      0.63     12233
weighted avg       0.86      0.89      0.87     12233

from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
y_prob = nb.predict_proba(X_test)[:,1]

print("Gaussian Naive Bayes Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - Gaussian Naive Bayes")
plt.show()

y_pred_proba = nb.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Gaussian Naive Bayes Report:
              precision    recall  f1-score   support

       False       0.92      0.92      0.92     10832
        True       0.40      0.42      0.41      1401

    accuracy                           0.86     12233
   macro avg       0.66      0.67      0.66     12233
weighted avg       0.86      0.86      0.86     12233

from sklearn.naive_bayes import CategoricalNB

data = df.copy()

data['age_bin'] = pd.qcut(data['age'], q=4, labels=False)
data['campaign_bin'] = pd.qcut(data['campaign'].rank(method='first'), q=4, labels=False)
data['pdays_bin'] = pd.qcut(data['pdays'].rank(method='first'), q=4, labels=False)
data['previous_bin'] = pd.qcut(data['previous'].rank(method='first'), q=4, labels=False)
data['emp_bin'] = pd.qcut(data['emp.var.rate'], q=2, labels=False)
data['price_bin'] = pd.qcut(data['cons.price.idx'], q=3, labels=False)
data['conf_bin'] = pd.qcut(data['cons.conf.idx'], q=3, labels=False)

data = data.drop(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx'], axis=1)

for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

X_cnb = data.drop("y", axis=1)
y_cnb = data["y"]

X_train_cnb, X_test_cnb, y_train_cnb, y_test_cnb = train_test_split(X_cnb, y_cnb, test_size=0.3, random_state=25)

cnb = CategoricalNB()
cnb.fit(X_train_cnb, y_train_cnb)

y_pred_cnb = cnb.predict(X_test_cnb)
y_prob_cnb = cnb.predict_proba(X_test_cnb)[:, 1]

print("Categorical Naive Bayes Report:")
print(classification_report(y_test_cnb, y_pred_cnb))

sns.heatmap(confusion_matrix(y_test_cnb, y_pred_cnb), annot=True, cmap="YlGnBu", fmt='g')
plt.title("Confusion Matrix - Categorical Naive Bayes")
plt.show()

y_pred_proba = cnb.predict_proba(X_test_cnb)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test_cnb,  y_pred_proba)
auc = metrics.roc_auc_score(y_test_cnb, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Categorical Naive Bayes Report:
              precision    recall  f1-score   support

           0       0.94      0.84      0.88     10832
           1       0.31      0.58      0.41      1401

    accuracy                           0.81     12233
   macro avg       0.63      0.71      0.65     12233
weighted avg       0.87      0.81      0.83     12233

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:,1]

print("Decision Tree Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - Decision Tree")
plt.show()

y_pred_proba = dt.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Decision Tree Report:
              precision    recall  f1-score   support

       False       0.91      0.90      0.91     10832
        True       0.30      0.32      0.31      1401

    accuracy                           0.84     12233
   macro avg       0.61      0.61      0.61     12233
weighted avg       0.84      0.84      0.84     12233

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:,1]

print("Random Forest Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - Random Forest")
plt.show()

y_pred_proba = rf.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Random Forest Report:
              precision    recall  f1-score   support

       False       0.91      0.97      0.94     10832
        True       0.53      0.27      0.36      1401

    accuracy                           0.89     12233
   macro avg       0.72      0.62      0.65     12233
weighted avg       0.87      0.89      0.87     12233

from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:,1]

print("XGBoost Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - XGBoost")
plt.show()

y_pred_proba = xgb.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

XGBoost Report:
              precision    recall  f1-score   support

       False       0.91      0.97      0.94     10832
        True       0.58      0.27      0.37      1401

    accuracy                           0.89     12233
   macro avg       0.74      0.62      0.65     12233
weighted avg       0.87      0.89      0.88     12233

data = df.copy()

for col in data.select_dtypes(include='bool').columns:
    data[col] = data[col].astype(int)

X_tdt = data.drop("y", axis=1)
y_tdt = data["y"]

X_train_tdt, X_test_tdt, y_train_tdt, y_test_tdt = train_test_split(X_tdt, y_tdt, test_size=0.3, random_state=25)

tdt = DecisionTreeClassifier()
tdt.fit(X_train_tdt, y_train_tdt)

DecisionTreeClassifier()

DecisionTreeClassifier()

dt.score(X_train_tdt, y_train_tdt), dt.score(X_test_tdt, y_test_tdt)

(0.9947796230117021, 0.8363443145589798)

from sklearn.tree import plot_tree
plt.figure(figsize=(20,10)) 
plot_tree(tdt, filled=True, feature_names=X_train_tdt.columns.tolist(), class_names=[str(cls) for cls in tdt.classes_])
plt.show()

tree_info = tdt.tree_
num_nodes = tree_info.node_count
num_leaves = tree_info.n_leaves
num_decision = num_nodes - num_leaves

print("Number of nodes:", num_nodes)
print("Number of leaves:", num_leaves)
print("Number of decision:", num_decision)

Number of nodes: 8963
Number of leaves: 4482
Number of decision: 4481

tdt = DecisionTreeClassifier(max_depth = 3, random_state=0)
tdt.fit(X_train_tdt, y_train_tdt)
tdt.score(X_train_tdt, y_train_tdt), tdt.score(X_test_tdt, y_test_tdt)

(0.9015836311400742, 0.8969181721572795)

plt.figure(figsize=(20,10)) 
plot_tree(tdt, filled=True, feature_names=X_train_tdt.columns.tolist(), class_names=[str(cls) for cls in tdt.classes_])
plt.show()

tree_info = tdt.tree_
num_nodes = tree_info.node_count
num_leaves = tree_info.n_leaves
num_decision = num_nodes - num_leaves

print("Number of nodes:", num_nodes)
print("Number of leaves:", num_leaves)
print("Number of decision:", num_decision)

Number of nodes: 15
Number of leaves: 8
Number of decision: 7

tdt = DecisionTreeClassifier(max_depth = 4, random_state=0)
tdt.fit(X_train_tdt, y_train_tdt)
tdt.score(X_train_tdt, y_train_tdt), tdt.score(X_test_tdt, y_test_tdt)

(0.9020040641861117, 0.8970816643505273)

plt.figure(figsize=(20,10)) 
plot_tree(tdt, filled=True, feature_names=X_train_tdt.columns.tolist(), class_names=[str(cls) for cls in tdt.classes_])
plt.show()

tree_info = tdt.tree_
num_nodes = tree_info.node_count
num_leaves = tree_info.n_leaves
num_decision = num_nodes - num_leaves

print("Number of nodes:", num_nodes)
print("Number of leaves:", num_leaves)
print("Number of decision:", num_decision)

Number of nodes: 31
Number of leaves: 16
Number of decision: 15

depths = [3, 4, 5, 6, 7, 8, 9]
train_accuracies = []
test_accuracies = []

for depth in depths:
    tdt = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tdt.fit(X_train_tdt, y_train_tdt)
    train_accuracies.append(tdt.score(X_train_tdt, y_train_tdt))
    test_accuracies.append(tdt.score(X_test_tdt, y_test_tdt))

plt.figure(figsize=(10, 6))
plt.plot(depths, train_accuracies, marker='o', label='Train Accuracy')
plt.plot(depths, test_accuracies, marker='o', label='Test Accuracy')
plt.title('Decision Tree Performance vs Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.xticks(depths)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

minSamplesSplits = list(range(10, 50))
train_accuracies = []
test_accuracies = []

for minSamplesSplit in minSamplesSplits:
    tdt = DecisionTreeClassifier(max_depth=6, min_samples_split = minSamplesSplit, random_state=0)
    tdt.fit(X_train_tdt, y_train_tdt)
    train_accuracies.append(tdt.score(X_train_tdt, y_train_tdt))
    test_accuracies.append(tdt.score(X_test_tdt, y_test_tdt))

plt.figure(figsize=(10, 6))
plt.plot(minSamplesSplits, train_accuracies, marker='o', label='Train Accuracy')
plt.plot(minSamplesSplits, test_accuracies, marker='o', label='Test Accuracy')
plt.title('Decision Tree Performance vs minSamplesSplits')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.xticks(minSamplesSplits)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'max_depth': list(range(2, 15)),
    'min_samples_split': list(range(10, 40)),  
    'min_samples_leaf': list(range(10, 40)),
    'max_features': [None, 'sqrt', 'log2']  
}

random_search = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_distributions=param_dist,
    n_iter=100, 
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=0,
    verbose=1
)

random_search.fit(X_train_tdt, y_train_tdt)
print("Best Parameters:", random_search.best_params_)

best_model = random_search.best_estimator_
test_score = best_model.score(X_test_tdt, y_test_tdt)
print("Test Set Score:", test_score)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'min_samples_split': 12, 'min_samples_leaf': 11, 'max_features': None, 'max_depth': 3}
Test Set Score: 0.8969181721572795

tdt = DecisionTreeClassifier(min_samples_split=12, min_samples_leaf=11, max_features=None, max_depth=3)
tdt.fit(X_train_tdt, y_train_tdt)
tdt.score(X_train_tdt, y_train_tdt), tdt.score(X_test_tdt, y_test_tdt)

(0.9015836311400742, 0.8969181721572795)

plt.figure(figsize=(20,10)) 
plot_tree(tdt, filled=True, feature_names=X_train_tdt.columns.tolist(), class_names=[str(cls) for cls in tdt.classes_])
plt.show()

tree_info = tdt.tree_
num_nodes = tree_info.node_count
num_leaves = tree_info.n_leaves
num_decision = num_nodes - num_leaves

print("Number of nodes:", num_nodes)
print("Number of leaves:", num_leaves)
print("Number of decision:", num_decision)

Number of nodes: 15
Number of leaves: 8
Number of decision: 7

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(min_samples_split=12, min_samples_leaf=11, max_features=None, max_depth=3)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:,1]

print("Decision Tree Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - Decision Tree")
plt.show()

y_pred_proba = dt.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Decision Tree Report:
              precision    recall  f1-score   support

       False       0.90      0.99      0.94     10832
        True       0.68      0.19      0.30      1401

    accuracy                           0.90     12233
   macro avg       0.79      0.59      0.62     12233
weighted avg       0.88      0.90      0.87     12233

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:,1]

print("Decision Tree Report:")
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title("Confusion Matrix - Decision Tree")
plt.show()

y_pred_proba = dt.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Decision Tree Report:
              precision    recall  f1-score   support

       False       0.91      0.90      0.91     10832
        True       0.30      0.32      0.31      1401

    accuracy                           0.84     12233
   macro avg       0.61      0.61      0.61     12233
weighted avg       0.84      0.84      0.84     12233

	age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"
0	56;"housemaid";"married";"basic.4y";"no";"no";...
1	57;"services";"married";"high.school";"unknown...
2	37;"services";"married";"high.school";"no";"ye...
3	40;"admin.";"married";"basic.6y";"no";"no";"no...
4	56;"services";"married";"high.school";"no";"no...

	age	job	marital	education	default	housing	loan	contact	month	day_of_week	...	campaign	pdays	poutcome	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed	y
0	56	housemaid	married	basic.4y	no	no	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
1	57	services	married	high.school	unknown	no	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
2	37	services	married	high.school	no	yes	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
3	40	admin.	married	basic.6y	no	no	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
4	56	services	married	high.school	no	no	yes	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no

	age	job	marital	education	default	housing	loan	contact	month	day_of_week	...	campaign	pdays	poutcome	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed	y
1266	39	blue-collar	married	basic.6y	no	no	no	telephone	may	thu	...	1	999	nonexistent	1.1	93.994	-36.4	4.855	5191.0	no
12261	36	retired	married	unknown	no	no	no	telephone	jul	thu	...	1	999	nonexistent	1.4	93.918	-42.7	4.966	5228.1	no
14234	27	technician	single	professional.course	no	no	no	cellular	jul	mon	...	2	999	nonexistent	1.4	93.918	-42.7	4.962	5228.1	no
16956	47	technician	divorced	high.school	no	yes	no	cellular	jul	thu	...	3	999	nonexistent	1.4	93.918	-42.7	4.962	5228.1	no
18465	32	technician	single	professional.course	no	yes	no	cellular	jul	thu	...	1	999	nonexistent	1.4	93.918	-42.7	4.968	5228.1	no
20216	55	services	married	high.school	unknown	no	no	cellular	aug	mon	...	1	999	nonexistent	1.4	93.444	-36.1	4.965	5228.1	no
20534	41	technician	married	professional.course	no	yes	no	cellular	aug	tue	...	1	999	nonexistent	1.4	93.444	-36.1	4.966	5228.1	no
25217	39	admin.	married	university.degree	no	no	no	cellular	nov	tue	...	2	999	nonexistent	-0.1	93.200	-42.0	4.153	5195.8	no
28477	24	services	single	high.school	no	yes	no	cellular	apr	tue	...	1	999	nonexistent	-1.8	93.075	-47.1	1.423	5099.1	no
32516	35	admin.	married	university.degree	no	yes	no	cellular	may	fri	...	4	999	nonexistent	-1.8	92.893	-46.2	1.313	5099.1	no
36951	45	admin.	married	university.degree	no	no	no	cellular	jul	thu	...	1	999	nonexistent	-2.9	92.469	-33.6	1.072	5076.2	yes
38281	71	retired	single	university.degree	no	no	no	telephone	oct	tue	...	1	999	nonexistent	-3.4	92.431	-26.9	0.742	5017.5	no

	0
age	0.000000
job	0.801438
marital	0.194288
education	4.201477
default	20.876239
housing	2.404313
loan	2.404313
contact	0.000000
month	0.000000
day_of_week	0.000000
duration	0.000000
campaign	0.000000
pdays	0.000000
previous	0.000000
poutcome	0.000000
emp.var.rate	0.000000
cons.price.idx	0.000000
cons.conf.idx	0.000000
euribor3m	0.000000
nr.employed	0.000000
y	0.000000

	age	campaign	pdays	previous	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed
age	1.000000	0.003414	-0.034230	0.024465	-0.000586	0.000159	0.127881	0.010448	-0.017610
campaign	0.003414	1.000000	0.052371	-0.078932	0.150160	0.127052	-0.013229	0.134621	0.143470
pdays	-0.034230	0.052371	1.000000	-0.586168	0.269956	0.078485	-0.092240	0.295609	0.371244
previous	0.024465	-0.078932	-0.586168	1.000000	-0.419486	-0.202290	-0.051704	-0.453549	-0.500352
emp.var.rate	-0.000586	0.150160	0.269956	-0.419486	1.000000	0.775292	0.198133	0.972231	0.906802
cons.price.idx	0.000159	0.127052	0.078485	-0.202290	0.775292	1.000000	0.061180	0.687997	0.521496
cons.conf.idx	0.127881	-0.013229	-0.092240	-0.051704	0.198133	0.061180	1.000000	0.279367	0.102142
euribor3m	0.010448	0.134621	0.295609	-0.453549	0.972231	0.687997	0.279367	1.000000	0.945131
nr.employed	-0.017610	0.143470	0.371244	-0.500352	0.906802	0.521496	0.102142	0.945131	1.000000

	Outlier %
age	1.13
campaign	5.82
pdays	3.65
previous	13.66
emp.var.rate	0.00
cons.price.idx	0.00
cons.conf.idx	1.07
euribor3m	0.00
nr.employed	0.00

Data Preprocessing¶

Changing the delimiter -¶

Dataset Overview -¶

Bank Client Data:¶

Related with the last contact of the current campaign:¶

Other attributes:¶

Social and economic context attributes:¶

Output variable (desired target):¶

Datatype conversion to numeric -¶

Handling duplicates -¶

Handling missing data values -¶

Handling the 'unknown' data values -¶

Dropping the 'duration' column -¶

Outlier detection using boxplots -¶

Outlier detection using the IQR method -¶

OHE of categorical features -¶

Performing label encoding and datatype conversion on the target 'y' -¶

Correlation analysis on numeric columns -¶

Using Variation Inflation Factor (VIF) to measure the amount of multicollinearity -¶

Dropping highly correlated features -¶

Recalculating VIF -¶

Predictive Analysis¶

Splitting the dataset -¶

Scalling the input data -¶

1) Logistic Regression¶

2) KNN¶

3) Gaussian Naive Bayes¶

4) Categorical Naive Bayes¶

5) Decision Tree¶

6) Random Forest¶

7) XGBoost¶

Hyperparameter Tuning of Decision Tree¶

Initial Train and Test accuracy -¶

Initial tree visualization -¶

Initial tree structure -¶

Tuning the 'max_depth' parameter and finding the accuracy -¶

Visualizing the tree -¶

Tuning the 'max_depth' parameter and finding the accuracy -¶

Visualizing the tree -¶

Visualing the effect of different 'max_depth' values on model's performance -¶

Introducing another hyperparameter and visualing the effect of different 'min_samples_split' values on model's performance -¶

Automating hyperparameter tuning by using Random Search Cross-Validation -¶

Evaluating the model after hyperparameter tuning -¶

Visualizing the decision tree after hyperparameter tuning -¶

Rerunning the model and evaluating the performance after tuning -¶

Models performance before tuning -¶