import numpy as np
import pandas as pd


df = pd.read_csv("depressionData.csv")
df


df.shape

(1503, 11)


df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 11 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   Timestamp                                  1503 non-null   object
 1   Age                                        1503 non-null   object
 2   Feeling sad                                1503 non-null   object
 3   Irritable towards people                   1497 non-null   object
 4   Trouble sleeping at night                  1503 non-null   object
 5   Problems concentrating or making decision  1491 non-null   object
 6   loss of appetite                           1503 non-null   object
 7   Feeling of guilt                           1494 non-null   object
 8   Feeling anxious                            1503 non-null   object
 9   Suicide attempt                            1503 non-null   object
 10  Depressed                                  1503 non-null   object
dtypes: object(11)
memory usage: 129.3+ KB


df.describe()


df.isnull().sum()

Timestamp                                     0
Age                                           0
Feeling sad                                   0
Irritable towards people                      6
Trouble sleeping at night                     0
Problems concentrating or making decision    12
loss of appetite                              0
Feeling of guilt                              9
Feeling anxious                               0
Suicide attempt                               0
Depressed                                     0
dtype: int64


#filling all the missing values with the next available values
df['Irritable towards people'] = df['Irritable towards people'].fillna(method = 'bfill')
df['Problems concentrating or making decision'] = df['Problems concentrating or making decision'].fillna(method = 'bfill')
df['Feeling of guilt'] = df['Feeling of guilt'].fillna(method = 'bfill')


df.isnull().sum()

Timestamp                                    0
Age                                          0
Feeling sad                                  0
Irritable towards people                     0
Trouble sleeping at night                    0
Problems concentrating or making decision    0
loss of appetite                             0
Feeling of guilt                             0
Feeling anxious                              0
Suicide attempt                              0
Depressed                                    0
dtype: int64


df = df.dropna()


df.shape

(1503, 11)

df


df = df.drop(['Timestamp'], axis = 1)
df


import matplotlib.pyplot as plt
import seaborn as sns


df["Trouble sleeping at night"].value_counts()

Two or more days a week    640
Yes                        445
No                         418
Name: Trouble sleeping at night, dtype: int64


df["Feeling sad"].value_counts()

Yes          536
No           524
Sometimes    443
Name: Feeling sad, dtype: int64


df["Irritable towards people"].value_counts()

Yes          561
No           499
Sometimes    443
Name: Irritable towards people, dtype: int64


df["Problems concentrating or making decision"].value_counts()

No       583
Often    473
Yes      447
Name: Problems concentrating or making decision, dtype: int64


df["loss of appetite"].value_counts()

No            841
Yes           343
Not at all    319
Name: loss of appetite, dtype: int64


df["Feeling of guilt"].value_counts()

No       633
Maybe    528
Yes      342
Name: Feeling of guilt, dtype: int64


df["Feeling anxious"].value_counts()

No           557
Sometimes    542
Yes          404
Name: Feeling anxious, dtype: int64


df["Suicide attempt"].value_counts()

No                       709
Yes                      459
Not interested to say    335
Name: Suicide attempt, dtype: int64


df["Depressed"].value_counts()

Yes    980
No     523
Name: Depressed, dtype: int64


# Iterate through all columns
for col in df.columns:
    if df[col].dtype == 'object':  # Check if column is categorical
        counts = df[col].value_counts()  # Get value counts of column
        counts.plot.bar(color=['red', 'blue', 'green', 'maroon', 'purple'])  # Generate a bar plot of value counts
        plt.title(col)  # Set the title of the plot to the column name
        plt.show()  # Show the plot


df['Feeling sad'].unique()

array(['Yes', 'No', 'Sometimes'], dtype=object)


ordinal=['Yes', 'No', 'Sometimes']


from sklearn.preprocessing import OrdinalEncoder
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Feeling sad']])

OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])


final=pd.DataFrame(ord.fit_transform(df[['Feeling sad']]))
final


df['Feeling sad']=final
df


df['Irritable towards people'].unique()

array(['Yes', 'No', 'Sometimes'], dtype=object)


ordinal=['Yes', 'No', 'Sometimes']


ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Irritable towards people']])

OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])


OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])

OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])


final2=pd.DataFrame(ord.fit_transform(df[['Irritable towards people']]))
final2


df['Irritable towards people']=final2
df


df['Trouble sleeping at night'].unique()

array(['Two or more days a week', 'No', 'Yes'], dtype=object)


ordinal=['Two or more days a week', 'No', 'Yes']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Trouble sleeping at night']])
final3=pd.DataFrame(ord.fit_transform(df[['Trouble sleeping at night']]))
final3


df['Trouble sleeping at night']=final3
df


df['Problems concentrating or making decision'].unique()

array(['Yes', 'No', 'Often'], dtype=object)


ordinal=['Yes', 'No', 'Often']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Problems concentrating or making decision']])
final4=pd.DataFrame(ord.fit_transform(df[['Problems concentrating or making decision']]))
final4


df['Problems concentrating or making decision']=final4
df


df['loss of appetite'].unique()

array(['Yes', 'No', 'Not at all'], dtype=object)


ordinal=['Yes', 'No', 'Not at all']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['loss of appetite']])
final5=pd.DataFrame(ord.fit_transform(df[['loss of appetite']]))
final5


df['loss of appetite']=final5
df


df['Feeling of guilt'].unique()

array(['No', 'Yes', 'Maybe'], dtype=object)


ordinal=['No', 'Yes', 'Maybe']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Feeling of guilt']])

OrdinalEncoder(categories=[['No', 'Yes', 'Maybe']])


final6=pd.DataFrame(ord.fit_transform(df[['Feeling of guilt']]))
final6


df['Feeling of guilt']=final6
df


df['Feeling anxious'].unique()

array(['Yes', 'Sometimes', 'No'], dtype=object)


ordinal=['Yes', 'Sometimes', 'No']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Feeling anxious']])
final7=pd.DataFrame(ord.fit_transform(df[['Feeling anxious']]))
final7


df['Feeling anxious']=final7
df


df['Age'].unique()

array(['35-40', '40-45', '30-35', '45-50', '25-30'], dtype=object)


ordinal=['35-40', '40-45', '30-35', '45-50', '25-30']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Age']])
final8=pd.DataFrame(ord.fit_transform(df[['Age']]))
final8


df['Age']=final8
df


df['Suicide attempt'].unique()

array(['Yes', 'No', 'Not interested to say'], dtype=object)


ordinal=['Yes', 'No', 'Not interested to say']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Suicide attempt']])
final9=pd.DataFrame(ord.fit_transform(df[['Suicide attempt']]))
final9


df['Suicide attempt']=final9
df


df['Depressed'].unique()

array(['Yes', 'No'], dtype=object)


ordinal=['Yes', 'No']
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Depressed']])
final10=pd.DataFrame(ord.fit_transform(df[['Depressed']]))
final10


df['Depressed']=final10
df


fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, ax=ax, cmap="YlGnBu" )

<AxesSubplot:>


'''from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
x=df[df.columns[:9]]
y=df["Depressed"]
clf.fit(x,y)
feature_imp = pd.DataFrame(clf.feature_importances_,index=x.columns)
feature_imp.sort_values(by = 0 , ascending = False)'''

'from sklearn.ensemble import RandomForestClassifier\nclf = RandomForestClassifier()\nx=df[df.columns[:9]]\ny=df["Depressed"]\nclf.fit(x,y)\nfeature_imp = pd.DataFrame(clf.feature_importances_,index=x.columns)\nfeature_imp.sort_values(by = 0 , ascending = False)'


X= df[["Irritable towards people",'Trouble sleeping at night','Feeling sad','Feeling of guilt','Feeling anxious',"Problems concentrating or making decision","Suicide attempt"]]
y =df["Depressed"]

X

y

0       0.0
1       1.0
2       0.0
3       0.0
4       0.0
       ... 
1498    0.0
1499    0.0
1500    1.0
1501    0.0
1502    1.0
Name: Depressed, Length: 1503, dtype: float64


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


clf = RandomForestClassifier()
x=df[df.columns[:9]]
y=df["Depressed"]
clf.fit(x,y)
feature_imp = pd.DataFrame(clf.feature_importances_,index=x.columns)
feature_imp.sort_values(by = 0 , ascending = False)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# Fit a Logistic Regression Classifier
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)

# Predict using the Logistic Regression Classifier
y_pred_lr = lr.predict(X_test)

# Calculate the accuracy of the Logistic Regression Classifier
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_lr

0.7807308970099668


# Fit a Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

# Predict using the Decision Tree Classifier
y_pred_dt = dt.predict(X_test)

# Calculate the accuracy of the Decision Tree Classifier
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_dt

0.9501661129568106


# Fit a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

# Predict using the Random Forest Classifier
y_pred_rf = rf.predict(X_test)

# Calculate the accuracy of the Random Forest Classifier
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_rf

0.9501661129568106


# Fit a K-Nearest Neighbors Classifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predict using the K-Nearest Neighbors Classifier
y_pred_knn = knn.predict(X_test)

# Calculate the accuracy of the K-Nearest Neighbors Classifier
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_knn

0.8837209302325582


# Fit a Support Vector Machine Classifier
svm = SVC(random_state=0)
svm.fit(X_train, y_train)

# Predict using the Support Vector Machine Classifier
y_pred_svm = svm.predict(X_test)

# Calculate the accuracy of the Support Vector Machine Classifier
accuracy_svm = accuracy_score(y_test, y_pred_svm)
accuracy_svm

0.8770764119601329


models = {
    "Random Forest": accuracy_rf,
    "Logistic Regression": accuracy_lr,
    "Decision Tree": accuracy_dt,
    "K-Nearest Neighbors": accuracy_knn,
    "Support Vector Machine": accuracy_svm
}

# Sort the models by accuracy
sorted_models = {k: v for k, v in sorted(models.items(), key=lambda item: item[1], reverse=True)}

# Print the accuracy of each model
for k, v in sorted_models.items():
    print(f"{k}: {v:.2f}")

Random Forest: 0.95
Decision Tree: 0.95
K-Nearest Neighbors: 0.88
Support Vector Machine: 0.88
Logistic Regression: 0.78


from sklearn.metrics import confusion_matrix

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred_dt)

# Plot the confusion matrix as a heatmap
sns.heatmap(cm, annot=True, fmt="d")

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Decision Tree Classifier")
plt.show()

git

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-76-b40508769cb0> in <module>
----> 1 git

NameError: name 'git' is not defined

Data Cleaning¶

Data Visualization¶

Data Preprocessing¶

Splitting Data¶

Logistic Regression Classifier¶

Decision Teee Classifier¶

Random Forest Classifier¶

K-Nearest Neighbour Classifier¶

Support Vector Machine Classifier¶

Confusion Matrix¶

	Timestamp	Age	Feeling sad	Irritable towards people	Trouble sleeping at night	Problems concentrating or making decision	loss of appetite	Feeling of guilt	Feeling anxious	Suicide attempt	Depressed
0	6/14/2022 20:02	35-40	Yes	Yes	Two or more days a week	Yes	Yes	No	Yes	Yes	Yes
1	6/14/2022 20:03	40-45	Yes	No	No	Yes	Yes	Yes	Yes	No	No
2	6/14/2022 20:04	35-40	Yes	No	Yes	Yes	Yes	No	Sometimes	No	Yes
3	6/14/2022 20:05	35-40	Yes	Yes	Yes	Yes	No	Maybe	No	No	Yes
4	6/14/2022 20:06	40-45	Yes	No	Two or more days a week	Yes	No	No	Yes	No	Yes
...	...	...	...	...	...	...	...	...	...	...	...
1498	6/15/2022 0:35	30-35	Yes	No	Two or more days a week	No	No	Maybe	Sometimes	No	Yes
1499	6/15/2022 0:35	25-30	Sometimes	No	No	Often	No	Maybe	Yes	No	Yes
1500	6/15/2022 0:35	25-30	No	Sometimes	Two or more days a week	No	No	Yes	No	Not interested to say	No
1501	6/15/2022 0:36	25-30	No	Sometimes	Yes	Often	No	No	No	No	Yes
1502	6/15/2022 0:36	45-50	Sometimes	Sometimes	Two or more days a week	No	No	Maybe	No	No	No

	Timestamp	Age	Feeling sad	Irritable towards people	Trouble sleeping at night	Problems concentrating or making decision	loss of appetite	Feeling of guilt	Feeling anxious	Suicide attempt	Depressed
count	1503	1503	1503	1497	1503	1491	1503	1494	1503	1503	1503
unique	90	5	3	3	3	3	3	3	3	3	2
top	6/15/2022 22:24	40-45	Yes	Yes	Two or more days a week	No	No	No	No	No	Yes
freq	51	364	536	555	640	583	841	624	557	709	980

	0
Feeling of guilt	0.249209
Irritable towards people	0.126162
loss of appetite	0.105880
Feeling anxious	0.099921
Problems concentrating or making decision	0.098232
Suicide attempt	0.088702
Age	0.080376
Trouble sleeping at night	0.076571
Feeling sad	0.074945