In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load depressionData.csv into a DataFrame and display it.
df = pd.read_csv("depressionData.csv")
df
Out[2]:
Timestamp Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 6/14/2022 20:02 35-40 Yes Yes Two or more days a week Yes Yes No Yes Yes Yes
1 6/14/2022 20:03 40-45 Yes No No Yes Yes Yes Yes No No
2 6/14/2022 20:04 35-40 Yes No Yes Yes Yes No Sometimes No Yes
3 6/14/2022 20:05 35-40 Yes Yes Yes Yes No Maybe No No Yes
4 6/14/2022 20:06 40-45 Yes No Two or more days a week Yes No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ... ...
1498 6/15/2022 0:35 30-35 Yes No Two or more days a week No No Maybe Sometimes No Yes
1499 6/15/2022 0:35 25-30 Sometimes No No Often No Maybe Yes No Yes
1500 6/15/2022 0:35 25-30 No Sometimes Two or more days a week No No Yes No Not interested to say No
1501 6/15/2022 0:36 25-30 No Sometimes Yes Often No No No No Yes
1502 6/15/2022 0:36 45-50 Sometimes Sometimes Two or more days a week No No Maybe No No No

1503 rows × 11 columns

In [3]:
# (rows, columns) of the loaded frame.
df.shape
Out[3]:
(1503, 11)
In [4]:
# Preview the first rows of the frame.
df.head()
Out[4]:
Timestamp Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 6/14/2022 20:02 35-40 Yes Yes Two or more days a week Yes Yes No Yes Yes Yes
1 6/14/2022 20:03 40-45 Yes No No Yes Yes Yes Yes No No
2 6/14/2022 20:04 35-40 Yes No Yes Yes Yes No Sometimes No Yes
3 6/14/2022 20:05 35-40 Yes Yes Yes Yes No Maybe No No Yes
4 6/14/2022 20:06 40-45 Yes No Two or more days a week Yes No No Yes No Yes
In [5]:
# Per-column dtype and non-null count; columns with fewer non-null entries than rows have missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 11 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   Timestamp                                  1503 non-null   object
 1   Age                                        1503 non-null   object
 2   Feeling sad                                1503 non-null   object
 3   Irritable towards people                   1497 non-null   object
 4   Trouble sleeping at night                  1503 non-null   object
 5   Problems concentrating or making decision  1491 non-null   object
 6   loss of appetite                           1503 non-null   object
 7   Feeling of guilt                           1494 non-null   object
 8   Feeling anxious                            1503 non-null   object
 9   Suicide attempt                            1503 non-null   object
 10  Depressed                                  1503 non-null   object
dtypes: object(11)
memory usage: 129.3+ KB
In [6]:
# Summary per column (count / unique / top / freq, because every column is object dtype).
df.describe()
Out[6]:
Timestamp Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
count 1503 1503 1503 1497 1503 1491 1503 1494 1503 1503 1503
unique 90 5 3 3 3 3 3 3 3 3 2
top 6/15/2022 22:24 40-45 Yes Yes Two or more days a week No No No No No Yes
freq 51 364 536 555 640 583 841 624 557 709 980

Data Cleaning¶

In [7]:
# Number of missing values in each column.
df.isnull().sum()
Out[7]:
Timestamp                                     0
Age                                           0
Feeling sad                                   0
Irritable towards people                      6
Trouble sleeping at night                     0
Problems concentrating or making decision    12
loss of appetite                              0
Feeling of guilt                              9
Feeling anxious                               0
Suicide attempt                               0
Depressed                                     0
dtype: int64
In [8]:
# Fill each missing answer with the next non-missing value below it (backward fill).
# Series.bfill() is used instead of fillna(method='bfill'): the `method` argument
# of fillna is deprecated in recent pandas, while bfill() works on old and new versions.
# A gap at the very end of a column has no following value and stays missing.
for column in (
    'Irritable towards people',
    'Problems concentrating or making decision',
    'Feeling of guilt',
):
    df[column] = df[column].bfill()
In [9]:
# Re-count missing values after the backward fill.
df.isnull().sum()
Out[9]:
Timestamp                                    0
Age                                          0
Feeling sad                                  0
Irritable towards people                     0
Trouble sleeping at night                    0
Problems concentrating or making decision    0
loss of appetite                             0
Feeling of guilt                             0
Feeling anxious                              0
Suicide attempt                              0
Depressed                                    0
dtype: int64
In [10]:
# Drop any row that still has a missing value, then renumber the rows 0..n-1.
# The encoding cells below build new frames with a default 0..n-1 index and
# assign them back by label, so a gap left in df's index by a dropped row
# would misalign the codes and introduce NaN.
df = df.dropna().reset_index(drop=True)
In [11]:
# Shape after dropping rows with missing values.
df.shape
Out[11]:
(1503, 11)
In [12]:
# Display the cleaned frame.
df
Out[12]:
Timestamp Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 6/14/2022 20:02 35-40 Yes Yes Two or more days a week Yes Yes No Yes Yes Yes
1 6/14/2022 20:03 40-45 Yes No No Yes Yes Yes Yes No No
2 6/14/2022 20:04 35-40 Yes No Yes Yes Yes No Sometimes No Yes
3 6/14/2022 20:05 35-40 Yes Yes Yes Yes No Maybe No No Yes
4 6/14/2022 20:06 40-45 Yes No Two or more days a week Yes No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ... ...
1498 6/15/2022 0:35 30-35 Yes No Two or more days a week No No Maybe Sometimes No Yes
1499 6/15/2022 0:35 25-30 Sometimes No No Often No Maybe Yes No Yes
1500 6/15/2022 0:35 25-30 No Sometimes Two or more days a week No No Yes No Not interested to say No
1501 6/15/2022 0:36 25-30 No Sometimes Yes Often No No No No Yes
1502 6/15/2022 0:36 45-50 Sometimes Sometimes Two or more days a week No No Maybe No No No

1503 rows × 11 columns

In [13]:
# Remove the Timestamp column from the frame and display the result.
df = df.drop(columns=['Timestamp'])
df
Out[13]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 Yes Yes Two or more days a week Yes Yes No Yes Yes Yes
1 40-45 Yes No No Yes Yes Yes Yes No No
2 35-40 Yes No Yes Yes Yes No Sometimes No Yes
3 35-40 Yes Yes Yes Yes No Maybe No No Yes
4 40-45 Yes No Two or more days a week Yes No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 Yes No Two or more days a week No No Maybe Sometimes No Yes
1499 25-30 Sometimes No No Often No Maybe Yes No Yes
1500 25-30 No Sometimes Two or more days a week No No Yes No Not interested to say No
1501 25-30 No Sometimes Yes Often No No No No Yes
1502 45-50 Sometimes Sometimes Two or more days a week No No Maybe No No No

1503 rows × 10 columns

Data Visualization¶

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
In [15]:
# How often each answer occurs in "Trouble sleeping at night".
df["Trouble sleeping at night"].value_counts()
Out[15]:
Two or more days a week    640
Yes                        445
No                         418
Name: Trouble sleeping at night, dtype: int64
In [16]:
# How often each answer occurs in "Feeling sad".
df["Feeling sad"].value_counts()
Out[16]:
Yes          536
No           524
Sometimes    443
Name: Feeling sad, dtype: int64
In [17]:
# How often each answer occurs in "Irritable towards people".
df["Irritable towards people"].value_counts()
Out[17]:
Yes          561
No           499
Sometimes    443
Name: Irritable towards people, dtype: int64
In [18]:
# How often each answer occurs in "Problems concentrating or making decision".
df["Problems concentrating or making decision"].value_counts()
Out[18]:
No       583
Often    473
Yes      447
Name: Problems concentrating or making decision, dtype: int64
In [19]:
# How often each answer occurs in "loss of appetite".
df["loss of appetite"].value_counts()
Out[19]:
No            841
Yes           343
Not at all    319
Name: loss of appetite, dtype: int64
In [20]:
# How often each answer occurs in "Feeling of guilt".
df["Feeling of guilt"].value_counts()
Out[20]:
No       633
Maybe    528
Yes      342
Name: Feeling of guilt, dtype: int64
In [21]:
# How often each answer occurs in "Feeling anxious".
df["Feeling anxious"].value_counts()
Out[21]:
No           557
Sometimes    542
Yes          404
Name: Feeling anxious, dtype: int64
In [22]:
# How often each answer occurs in "Suicide attempt".
df["Suicide attempt"].value_counts()
Out[22]:
No                       709
Yes                      459
Not interested to say    335
Name: Suicide attempt, dtype: int64
In [23]:
# Class balance of the target column "Depressed".
df["Depressed"].value_counts()
Out[23]:
Yes    980
No     523
Name: Depressed, dtype: int64
In [24]:
# Draw one bar chart of answer frequencies for every object-dtype column.
for column_name in df.columns:
    # Skip anything that is not stored as object dtype.
    if not (df[column_name].dtype == 'object'):
        continue
    answer_counts = df[column_name].value_counts()
    # plot.bar returns the Axes it drew on, so the title is set on that Axes.
    bar_ax = answer_counts.plot.bar(color=['red', 'blue', 'green', 'maroon', 'purple'])
    bar_ax.set_title(column_name)
    plt.show()
In [25]:
# Distinct answers present in "Feeling sad".
df['Feeling sad'].unique()
Out[25]:
array(['Yes', 'No', 'Sometimes'], dtype=object)
In [26]:
# Category list handed to the encoder in the next cell; a value's position in
# the list becomes its numeric code ('Yes' -> 0, 'No' -> 1, 'Sometimes' -> 2).
# NOTE(review): this copies the order printed by unique() above, not a severity ranking.
ordinal=['Yes', 'No', 'Sometimes']

Data Preprocessing¶

In [27]:
from sklearn.preprocessing import OrdinalEncoder
# NOTE(review): the name `ord` shadows Python's built-in ord() from here on.
ord=OrdinalEncoder(categories=[ordinal])
# Fit the encoder on "Feeling sad"; the value of this call is what the cell displays.
# The following cell calls fit_transform on the same column, which fits again.
ord.fit(df[['Feeling sad']])
Out[27]:
OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])
In [28]:
# Encode "Feeling sad" as numeric codes. index=df.index gives the result the
# same row labels as df, so assigning it back to the column (which aligns on
# the index) cannot misplace codes or create NaN if df's index is not 0..n-1.
final = pd.DataFrame(ord.fit_transform(df[['Feeling sad']]), index=df.index)
final
Out[28]:
0
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
... ...
1498 0.0
1499 2.0
1500 1.0
1501 1.0
1502 2.0

1503 rows × 1 columns

In [29]:
# Overwrite the text answers with their numeric codes and display the frame.
# Assigning a DataFrame to a column aligns on the index, so `final` must carry
# the same row labels as df.
df['Feeling sad']=final
df
Out[29]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 Yes Two or more days a week Yes Yes No Yes Yes Yes
1 40-45 0.0 No No Yes Yes Yes Yes No No
2 35-40 0.0 No Yes Yes Yes No Sometimes No Yes
3 35-40 0.0 Yes Yes Yes No Maybe No No Yes
4 40-45 0.0 No Two or more days a week Yes No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 No Two or more days a week No No Maybe Sometimes No Yes
1499 25-30 2.0 No No Often No Maybe Yes No Yes
1500 25-30 1.0 Sometimes Two or more days a week No No Yes No Not interested to say No
1501 25-30 1.0 Sometimes Yes Often No No No No Yes
1502 45-50 2.0 Sometimes Two or more days a week No No Maybe No No No

1503 rows × 10 columns

In [30]:
# Distinct answers present in "Irritable towards people".
df['Irritable towards people'].unique()
Out[30]:
array(['Yes', 'No', 'Sometimes'], dtype=object)
In [31]:
# Category list for "Irritable towards people"; list position becomes the code
# ('Yes' -> 0, 'No' -> 1, 'Sometimes' -> 2).
# NOTE(review): this copies the order printed by unique() above, not a severity ranking.
ordinal=['Yes', 'No', 'Sometimes']
In [32]:
# Build an encoder with the category list above and fit it on
# "Irritable towards people"; the value of fit() is what the cell displays.
# NOTE(review): `ord` shadows Python's built-in ord().
ord=OrdinalEncoder(categories=[ordinal])
ord.fit(df[['Irritable towards people']])
Out[32]:
OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])
In [33]:
# NOTE(review): this constructs a new encoder that is neither assigned nor
# fitted; the cell only echoes its repr and has no effect on `ord` or `df`.
OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])
Out[33]:
OrdinalEncoder(categories=[['Yes', 'No', 'Sometimes']])
In [34]:
# Encode "Irritable towards people" as numeric codes. index=df.index gives the
# result the same row labels as df, so assigning it back to the column (which
# aligns on the index) stays correct even if df's index is not 0..n-1.
final2 = pd.DataFrame(ord.fit_transform(df[['Irritable towards people']]), index=df.index)
final2
Out[34]:
0
0 0.0
1 1.0
2 1.0
3 0.0
4 1.0
... ...
1498 1.0
1499 1.0
1500 2.0
1501 2.0
1502 2.0

1503 rows × 1 columns

In [35]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final2` must share df's row labels.
df['Irritable towards people']=final2
df
Out[35]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 0.0 Two or more days a week Yes Yes No Yes Yes Yes
1 40-45 0.0 1.0 No Yes Yes Yes Yes No No
2 35-40 0.0 1.0 Yes Yes Yes No Sometimes No Yes
3 35-40 0.0 0.0 Yes Yes No Maybe No No Yes
4 40-45 0.0 1.0 Two or more days a week Yes No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 1.0 Two or more days a week No No Maybe Sometimes No Yes
1499 25-30 2.0 1.0 No Often No Maybe Yes No Yes
1500 25-30 1.0 2.0 Two or more days a week No No Yes No Not interested to say No
1501 25-30 1.0 2.0 Yes Often No No No No Yes
1502 45-50 2.0 2.0 Two or more days a week No No Maybe No No No

1503 rows × 10 columns

In [36]:
# Distinct answers present in "Trouble sleeping at night".
df['Trouble sleeping at night'].unique()
Out[36]:
array(['Two or more days a week', 'No', 'Yes'], dtype=object)
In [37]:
# Encode "Trouble sleeping at night": each answer becomes its position in
# `ordinal` ('Two or more days a week' -> 0, 'No' -> 1, 'Yes' -> 2).
# NOTE(review): this is the order printed by unique() above, not a frequency ranking.
ordinal = ['Two or more days a week', 'No', 'Yes']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final3 = pd.DataFrame(ord.fit_transform(df[['Trouble sleeping at night']]), index=df.index)
final3
Out[37]:
0
0 0.0
1 1.0
2 2.0
3 2.0
4 0.0
... ...
1498 0.0
1499 1.0
1500 0.0
1501 2.0
1502 0.0

1503 rows × 1 columns

In [38]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final3` must share df's row labels.
df['Trouble sleeping at night']=final3
df
Out[38]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 0.0 0.0 Yes Yes No Yes Yes Yes
1 40-45 0.0 1.0 1.0 Yes Yes Yes Yes No No
2 35-40 0.0 1.0 2.0 Yes Yes No Sometimes No Yes
3 35-40 0.0 0.0 2.0 Yes No Maybe No No Yes
4 40-45 0.0 1.0 0.0 Yes No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 1.0 0.0 No No Maybe Sometimes No Yes
1499 25-30 2.0 1.0 1.0 Often No Maybe Yes No Yes
1500 25-30 1.0 2.0 0.0 No No Yes No Not interested to say No
1501 25-30 1.0 2.0 2.0 Often No No No No Yes
1502 45-50 2.0 2.0 0.0 No No Maybe No No No

1503 rows × 10 columns

In [39]:
# Distinct answers present in "Problems concentrating or making decision".
df['Problems concentrating or making decision'].unique()
Out[39]:
array(['Yes', 'No', 'Often'], dtype=object)
In [40]:
# Encode "Problems concentrating or making decision": each answer becomes its
# position in `ordinal` ('Yes' -> 0, 'No' -> 1, 'Often' -> 2).
# NOTE(review): this is the order printed by unique() above, not a severity ranking.
ordinal = ['Yes', 'No', 'Often']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final4 = pd.DataFrame(
    ord.fit_transform(df[['Problems concentrating or making decision']]),
    index=df.index,
)
final4
Out[40]:
0
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
... ...
1498 1.0
1499 2.0
1500 1.0
1501 2.0
1502 1.0

1503 rows × 1 columns

In [41]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final4` must share df's row labels.
df['Problems concentrating or making decision']=final4
df
Out[41]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 0.0 0.0 0.0 Yes No Yes Yes Yes
1 40-45 0.0 1.0 1.0 0.0 Yes Yes Yes No No
2 35-40 0.0 1.0 2.0 0.0 Yes No Sometimes No Yes
3 35-40 0.0 0.0 2.0 0.0 No Maybe No No Yes
4 40-45 0.0 1.0 0.0 0.0 No No Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 1.0 0.0 1.0 No Maybe Sometimes No Yes
1499 25-30 2.0 1.0 1.0 2.0 No Maybe Yes No Yes
1500 25-30 1.0 2.0 0.0 1.0 No Yes No Not interested to say No
1501 25-30 1.0 2.0 2.0 2.0 No No No No Yes
1502 45-50 2.0 2.0 0.0 1.0 No Maybe No No No

1503 rows × 10 columns

In [42]:
# Distinct answers present in "loss of appetite".
df['loss of appetite'].unique()
Out[42]:
array(['Yes', 'No', 'Not at all'], dtype=object)
In [43]:
# Encode "loss of appetite": each answer becomes its position in `ordinal`
# ('Yes' -> 0, 'No' -> 1, 'Not at all' -> 2).
# NOTE(review): this is the order printed by unique() above, not a severity ranking.
ordinal = ['Yes', 'No', 'Not at all']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final5 = pd.DataFrame(ord.fit_transform(df[['loss of appetite']]), index=df.index)
final5
Out[43]:
0
0 0.0
1 0.0
2 0.0
3 1.0
4 1.0
... ...
1498 1.0
1499 1.0
1500 1.0
1501 1.0
1502 1.0

1503 rows × 1 columns

In [44]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final5` must share df's row labels.
df['loss of appetite']=final5
df
Out[44]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 0.0 0.0 0.0 0.0 No Yes Yes Yes
1 40-45 0.0 1.0 1.0 0.0 0.0 Yes Yes No No
2 35-40 0.0 1.0 2.0 0.0 0.0 No Sometimes No Yes
3 35-40 0.0 0.0 2.0 0.0 1.0 Maybe No No Yes
4 40-45 0.0 1.0 0.0 0.0 1.0 No Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 1.0 0.0 1.0 1.0 Maybe Sometimes No Yes
1499 25-30 2.0 1.0 1.0 2.0 1.0 Maybe Yes No Yes
1500 25-30 1.0 2.0 0.0 1.0 1.0 Yes No Not interested to say No
1501 25-30 1.0 2.0 2.0 2.0 1.0 No No No Yes
1502 45-50 2.0 2.0 0.0 1.0 1.0 Maybe No No No

1503 rows × 10 columns

In [45]:
# Distinct answers present in "Feeling of guilt".
df['Feeling of guilt'].unique()
Out[45]:
array(['No', 'Yes', 'Maybe'], dtype=object)
In [46]:
# Category list for "Feeling of guilt"; list position becomes the code
# ('No' -> 0, 'Yes' -> 1, 'Maybe' -> 2).
# NOTE(review): this copies the order printed by unique() above, not a severity ranking.
ordinal=['No', 'Yes', 'Maybe']
# NOTE(review): `ord` shadows Python's built-in ord().
ord=OrdinalEncoder(categories=[ordinal])
# Fit on the column; the value of fit() is what the cell displays.
ord.fit(df[['Feeling of guilt']])
Out[46]:
OrdinalEncoder(categories=[['No', 'Yes', 'Maybe']])
In [47]:
# Encode "Feeling of guilt" as numeric codes. index=df.index gives the result
# the same row labels as df, so assigning it back to the column (which aligns
# on the index) stays correct even if df's index is not 0..n-1.
final6 = pd.DataFrame(ord.fit_transform(df[['Feeling of guilt']]), index=df.index)
final6
Out[47]:
0
0 0.0
1 1.0
2 0.0
3 2.0
4 0.0
... ...
1498 2.0
1499 2.0
1500 1.0
1501 0.0
1502 2.0

1503 rows × 1 columns

In [48]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final6` must share df's row labels.
df['Feeling of guilt']=final6
df
Out[48]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes Yes
1 40-45 0.0 1.0 1.0 0.0 0.0 1.0 Yes No No
2 35-40 0.0 1.0 2.0 0.0 0.0 0.0 Sometimes No Yes
3 35-40 0.0 0.0 2.0 0.0 1.0 2.0 No No Yes
4 40-45 0.0 1.0 0.0 0.0 1.0 0.0 Yes No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 1.0 0.0 1.0 1.0 2.0 Sometimes No Yes
1499 25-30 2.0 1.0 1.0 2.0 1.0 2.0 Yes No Yes
1500 25-30 1.0 2.0 0.0 1.0 1.0 1.0 No Not interested to say No
1501 25-30 1.0 2.0 2.0 2.0 1.0 0.0 No No Yes
1502 45-50 2.0 2.0 0.0 1.0 1.0 2.0 No No No

1503 rows × 10 columns

In [49]:
# Distinct answers present in "Feeling anxious".
df['Feeling anxious'].unique()
Out[49]:
array(['Yes', 'Sometimes', 'No'], dtype=object)
In [50]:
# Encode "Feeling anxious": each answer becomes its position in `ordinal`
# ('Yes' -> 0, 'Sometimes' -> 1, 'No' -> 2).
ordinal = ['Yes', 'Sometimes', 'No']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final7 = pd.DataFrame(ord.fit_transform(df[['Feeling anxious']]), index=df.index)
final7
Out[50]:
0
0 0.0
1 0.0
2 1.0
3 2.0
4 0.0
... ...
1498 1.0
1499 0.0
1500 2.0
1501 2.0
1502 2.0

1503 rows × 1 columns

In [51]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final7` must share df's row labels.
df['Feeling anxious']=final7
df
Out[51]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 35-40 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes
1 40-45 0.0 1.0 1.0 0.0 0.0 1.0 0.0 No No
2 35-40 0.0 1.0 2.0 0.0 0.0 0.0 1.0 No Yes
3 35-40 0.0 0.0 2.0 0.0 1.0 2.0 2.0 No Yes
4 40-45 0.0 1.0 0.0 0.0 1.0 0.0 0.0 No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 30-35 0.0 1.0 0.0 1.0 1.0 2.0 1.0 No Yes
1499 25-30 2.0 1.0 1.0 2.0 1.0 2.0 0.0 No Yes
1500 25-30 1.0 2.0 0.0 1.0 1.0 1.0 2.0 Not interested to say No
1501 25-30 1.0 2.0 2.0 2.0 1.0 0.0 2.0 No Yes
1502 45-50 2.0 2.0 0.0 1.0 1.0 2.0 2.0 No No

1503 rows × 10 columns

In [52]:
# Distinct age brackets present in "Age".
df['Age'].unique()
Out[52]:
array(['35-40', '40-45', '30-35', '45-50', '25-30'], dtype=object)
In [53]:
# Encode "Age": each bracket becomes its position in `ordinal`
# ('35-40' -> 0, '40-45' -> 1, '30-35' -> 2, '45-50' -> 3, '25-30' -> 4).
# NOTE(review): the brackets are listed in the order printed by unique() above,
# so the codes do not increase with age.
ordinal = ['35-40', '40-45', '30-35', '45-50', '25-30']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final8 = pd.DataFrame(ord.fit_transform(df[['Age']]), index=df.index)
final8
Out[53]:
0
0 0.0
1 1.0
2 0.0
3 0.0
4 1.0
... ...
1498 2.0
1499 4.0
1500 4.0
1501 4.0
1502 3.0

1503 rows × 1 columns

In [54]:
# Overwrite the age brackets with their numeric codes and display the frame.
# The assignment aligns on the index, so `final8` must share df's row labels.
df['Age']=final8
df
Out[54]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Yes Yes
1 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 No No
2 0.0 0.0 1.0 2.0 0.0 0.0 0.0 1.0 No Yes
3 0.0 0.0 0.0 2.0 0.0 1.0 2.0 2.0 No Yes
4 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 No Yes
... ... ... ... ... ... ... ... ... ... ...
1498 2.0 0.0 1.0 0.0 1.0 1.0 2.0 1.0 No Yes
1499 4.0 2.0 1.0 1.0 2.0 1.0 2.0 0.0 No Yes
1500 4.0 1.0 2.0 0.0 1.0 1.0 1.0 2.0 Not interested to say No
1501 4.0 1.0 2.0 2.0 2.0 1.0 0.0 2.0 No Yes
1502 3.0 2.0 2.0 0.0 1.0 1.0 2.0 2.0 No No

1503 rows × 10 columns

In [55]:
# Distinct answers present in "Suicide attempt".
df['Suicide attempt'].unique()
Out[55]:
array(['Yes', 'No', 'Not interested to say'], dtype=object)
In [56]:
# Encode "Suicide attempt": each answer becomes its position in `ordinal`
# ('Yes' -> 0, 'No' -> 1, 'Not interested to say' -> 2).
# NOTE(review): 'Not interested to say' is a non-answer, yet it receives the
# highest code as if it were a point on the same scale.
ordinal = ['Yes', 'No', 'Not interested to say']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final9 = pd.DataFrame(ord.fit_transform(df[['Suicide attempt']]), index=df.index)
final9
Out[56]:
0
0 0.0
1 1.0
2 1.0
3 1.0
4 1.0
... ...
1498 1.0
1499 1.0
1500 2.0
1501 1.0
1502 1.0

1503 rows × 1 columns

In [57]:
# Overwrite the text answers with their numeric codes and display the frame.
# The assignment aligns on the index, so `final9` must share df's row labels.
df['Suicide attempt']=final9
df
Out[57]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Yes
1 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 1.0 No
2 0.0 0.0 1.0 2.0 0.0 0.0 0.0 1.0 1.0 Yes
3 0.0 0.0 0.0 2.0 0.0 1.0 2.0 2.0 1.0 Yes
4 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 Yes
... ... ... ... ... ... ... ... ... ... ...
1498 2.0 0.0 1.0 0.0 1.0 1.0 2.0 1.0 1.0 Yes
1499 4.0 2.0 1.0 1.0 2.0 1.0 2.0 0.0 1.0 Yes
1500 4.0 1.0 2.0 0.0 1.0 1.0 1.0 2.0 2.0 No
1501 4.0 1.0 2.0 2.0 2.0 1.0 0.0 2.0 1.0 Yes
1502 3.0 2.0 2.0 0.0 1.0 1.0 2.0 2.0 1.0 No

1503 rows × 10 columns

In [58]:
# Distinct labels present in the target column "Depressed".
df['Depressed'].unique()
Out[58]:
array(['Yes', 'No'], dtype=object)
In [59]:
# Encode the target "Depressed": 'Yes' -> 0, 'No' -> 1.
# NOTE: with this order, 0.0 means depressed and 1.0 means not depressed, which
# matters when reading the model outputs and the confusion matrix later on.
ordinal = ['Yes', 'No']
ord = OrdinalEncoder(categories=[ordinal])
# fit_transform fits and encodes in one call, so a separate fit() beforehand is
# redundant. index=df.index keeps the codes on df's row labels for the
# index-aligned column assignment that follows.
final10 = pd.DataFrame(ord.fit_transform(df[['Depressed']]), index=df.index)
final10
Out[59]:
0
0 0.0
1 1.0
2 0.0
3 0.0
4 0.0
... ...
1498 0.0
1499 0.0
1500 1.0
1501 0.0
1502 1.0

1503 rows × 1 columns

In [60]:
# Overwrite the target labels with their numeric codes and display the frame.
# The assignment aligns on the index, so `final10` must share df's row labels.
df['Depressed']=final10
df
Out[60]:
Age Feeling sad Irritable towards people Trouble sleeping at night Problems concentrating or making decision loss of appetite Feeling of guilt Feeling anxious Suicide attempt Depressed
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0
2 0.0 0.0 1.0 2.0 0.0 0.0 0.0 1.0 1.0 0.0
3 0.0 0.0 0.0 2.0 0.0 1.0 2.0 2.0 1.0 0.0
4 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ...
1498 2.0 0.0 1.0 0.0 1.0 1.0 2.0 1.0 1.0 0.0
1499 4.0 2.0 1.0 1.0 2.0 1.0 2.0 0.0 1.0 0.0
1500 4.0 1.0 2.0 0.0 1.0 1.0 1.0 2.0 2.0 1.0
1501 4.0 1.0 2.0 2.0 2.0 1.0 0.0 2.0 1.0 0.0
1502 3.0 2.0 2.0 0.0 1.0 1.0 2.0 2.0 1.0 1.0

1503 rows × 10 columns

In [61]:
# Pairwise correlation between the encoded columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=corr_matrix, annot=True, cmap="YlGnBu", ax=ax)
Out[61]:
<AxesSubplot:>
In [62]:
# Disabled code: the triple-quoted text below is evaluated as a plain string
# expression (the cell just echoes it) and none of it runs. The same steps are
# executed for real in a later cell, after the sklearn imports.
'''from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
x=df[df.columns[:9]]
y=df["Depressed"]
clf.fit(x,y)
feature_imp = pd.DataFrame(clf.feature_importances_,index=x.columns)
feature_imp.sort_values(by = 0 , ascending = False)'''
Out[62]:
'from sklearn.ensemble import RandomForestClassifier\nclf = RandomForestClassifier()\nx=df[df.columns[:9]]\ny=df["Depressed"]\nclf.fit(x,y)\nfeature_imp = pd.DataFrame(clf.feature_importances_,index=x.columns)\nfeature_imp.sort_values(by = 0 , ascending = False)'

Splitting Data¶

Splitting the data into the independent variables (features, X) and the dependent variable (target, y)

In [63]:
# Predictor columns used for modelling ("Age" and "loss of appetite" are left out)
# and the encoded target column.
feature_columns = [
    "Irritable towards people",
    'Trouble sleeping at night',
    'Feeling sad',
    'Feeling of guilt',
    'Feeling anxious',
    "Problems concentrating or making decision",
    "Suicide attempt",
]
X = df[feature_columns]
y = df["Depressed"]
In [64]:
# Display the feature matrix.
X
Out[64]:
Irritable towards people Trouble sleeping at night Feeling sad Feeling of guilt Feeling anxious Problems concentrating or making decision Suicide attempt
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1.0 1.0 0.0 1.0 0.0 0.0 1.0
2 1.0 2.0 0.0 0.0 1.0 0.0 1.0
3 0.0 2.0 0.0 2.0 2.0 0.0 1.0
4 1.0 0.0 0.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ...
1498 1.0 0.0 0.0 2.0 1.0 1.0 1.0
1499 1.0 1.0 2.0 2.0 0.0 2.0 1.0
1500 2.0 0.0 1.0 1.0 2.0 1.0 2.0
1501 2.0 2.0 1.0 0.0 2.0 2.0 1.0
1502 2.0 0.0 2.0 2.0 2.0 1.0 1.0

1503 rows × 7 columns

In [65]:
# Display the encoded target (0.0 = 'Yes', 1.0 = 'No').
y
Out[65]:
0       0.0
1       1.0
2       0.0
3       0.0
4       0.0
       ... 
1498    0.0
1499    0.0
1500    1.0
1501    0.0
1502    1.0
Name: Depressed, Length: 1503, dtype: float64
In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
In [67]:
# Rank all encoded features by Random Forest importance.
# random_state is pinned, as in the other model cells, so the ranking is the
# same on every run; an unseeded forest gives slightly different importances
# each time the cell is executed.
clf = RandomForestClassifier(random_state=0)
# Select the predictors by name (everything except the target) instead of by
# position, so the selection survives a change in column order or count.
x = df.drop(columns=["Depressed"])
y = df["Depressed"]
clf.fit(x, y)
# One row per feature, single column 0 holding the importance; highest first.
feature_imp = pd.DataFrame(clf.feature_importances_, index=x.columns)
feature_imp.sort_values(by=0, ascending=False)
Out[67]:
0
Feeling of guilt 0.249209
Irritable towards people 0.126162
loss of appetite 0.105880
Feeling anxious 0.099921
Problems concentrating or making decision 0.098232
Suicide attempt 0.088702
Age 0.080376
Trouble sleeping at night 0.076571
Feeling sad 0.074945

Splitting the data into train and test sets

In [68]:
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
# NOTE(review): no stratify= is passed although the target is imbalanced (980 vs 523).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Logistic Regression Classifier¶

In [69]:
# Logistic Regression: train on the training split, then label the held-out rows.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
accuracy_lr
Out[69]:
0.7807308970099668

Decision Tree Classifier¶

In [70]:
# Decision Tree: train on the training split, then label the held-out rows.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
accuracy_dt
Out[70]:
0.9501661129568106

Random Forest Classifier¶

In [71]:
# Random Forest (100 trees): train on the training split, then label the held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
accuracy_rf
Out[71]:
0.9501661129568106

K-Nearest Neighbour Classifier¶

In [72]:
# K-Nearest Neighbors (library defaults): train on the training split, then
# label the held-out rows.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
accuracy_knn
Out[72]:
0.8837209302325582

Support Vector Machine Classifier¶

In [73]:
# Support Vector Machine: train on the training split, then label the held-out rows.
svm = SVC(random_state=0).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
accuracy_svm
Out[73]:
0.8770764119601329

Accuracy of all the classifiers in descending order

In [74]:
# Test-set accuracy of every fitted classifier, keyed by display name.
model_names = [
    "Random Forest",
    "Logistic Regression",
    "Decision Tree",
    "K-Nearest Neighbors",
    "Support Vector Machine",
]
model_scores = [accuracy_rf, accuracy_lr, accuracy_dt, accuracy_knn, accuracy_svm]
models = dict(zip(model_names, model_scores))

# Highest accuracy first. sorted() is stable, so models with equal accuracy
# keep the order in which they were listed above.
ranked_pairs = sorted(models.items(), key=lambda pair: pair[1], reverse=True)
sorted_models = dict(ranked_pairs)

# Report each model's accuracy rounded to two decimals.
for k, v in sorted_models.items():
    print(f"{k}: {v:.2f}")
Random Forest: 0.95
Decision Tree: 0.95
K-Nearest Neighbors: 0.88
Support Vector Machine: 0.88
Logistic Regression: 0.78

Confusion Matrix¶

In [75]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the Decision Tree's test predictions
# (rows: true label, columns: predicted label).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)

# Draw the integer counts as an annotated heatmap and label it via the returned Axes.
cm_ax = sns.heatmap(cm, annot=True, fmt="d")
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_ax.set_title("Confusion Matrix for Decision Tree Classifier")
plt.show()
In [76]:
# A stray `git` expression was removed from this cell: a bare name is looked up
# as a Python variable and raised NameError, which breaks "Restart & Run All".
# To run git from a notebook, prefix the command with "!", e.g. `!git status`.
In [ ]: