Merge branch 'feature/exploratory-data-analysis' into development
@@ -0,0 +1 @@
|
||||
.venv
|
||||
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 27 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 27 KiB |
|
After Width: | Height: | Size: 66 KiB |
|
After Width: | Height: | Size: 46 KiB |
|
After Width: | Height: | Size: 1.4 MiB |
@@ -0,0 +1,83 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
# CSV file read by load_data(). The path is relative, so it is resolved
# against the current working directory of the process.
data_path = "student_lifestyle_dataset.csv"
|
||||
|
||||
def main():
    """Run the analysis end to end: load the data, preprocess it, plot it."""
    # Loading
    dataset = load_data()

    # Preprocessing: the frame is modified in place, nothing is returned.
    preprocess_data(dataset)

    # Exploratory data analysis
    draw_plots(dataset)
|
||||
|
||||
def load_data(path=None):
    """Read the dataset CSV and return it without the ``Student_ID`` column.

    Args:
        path: Location of the CSV file. When omitted, the module-level
            ``data_path`` is used, which is what the function always did
            before this parameter existed.

    Returns:
        A DataFrame with every column of the file except ``Student_ID``.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnicodeDecodeError: If the file contains non-ASCII bytes (the file
            is decoded strictly as ASCII).
        KeyError: If the file has no ``Student_ID`` column.
    """
    if path is None:
        path = data_path

    df = pd.read_csv(path, encoding="ascii", delimiter=",")

    # Removing unneeded feature. drop() returns a new frame, so the freshly
    # parsed one does not need to be mutated in place.
    return df.drop(columns="Student_ID")
|
||||
|
||||
def inspect_data(df):
    """Print a quick textual overview of ``df`` to standard output.

    Three sections are printed in order: the ``DataFrame.info()`` report,
    the first rows (``DataFrame.head()``) and summary statistics for every
    column (``DataFrame.describe(include="all")``).

    Args:
        df: The DataFrame to inspect. It is not modified.

    Returns:
        None.
    """
    print("Info:")
    # DataFrame.info() writes its report itself and returns None, so
    # wrapping it in print() emitted an extra "None" line after the report.
    df.info()
    print("\n")

    print("Head:")
    print(df.head())
    print("\n")

    print("Description:")
    print(df.describe(include="all"))
    print("\n")
|
||||
|
||||
def clean_data(df):
    """Report missing values per column, then drop incomplete rows in place.

    Args:
        df: The DataFrame to clean. Rows containing any missing value are
            removed from this object directly.

    Returns:
        None.
    """
    missing_per_column = df.isnull().sum()
    # One print call; the "\n" separator yields the same header, table and
    # trailing blank lines as three separate prints would.
    print("Missing values:", missing_per_column, "\n", sep="\n")

    df.dropna(inplace=True)
|
||||
|
||||
def order_data_stress_level(df):
    """Convert ``Stress_Level`` to an ordered categorical column in place.

    The category order is Low < Moderate < High.

    Args:
        df: DataFrame with a ``Stress_Level`` column; that column is
            replaced on this object.

    Returns:
        None.
    """
    stress_dtype = pd.CategoricalDtype(
        categories=["Low", "Moderate", "High"],
        ordered=True,
    )
    df["Stress_Level"] = pd.Categorical(df["Stress_Level"], dtype=stress_dtype)
|
||||
|
||||
def display_feature_distributions_histogram(df):
    """Draw 20-bin histograms via ``DataFrame.hist`` and show the figure.

    Args:
        df: The DataFrame whose columns are plotted.
    """
    histogram_options = {"bins": 20, "figsize": (10, 8)}
    df.hist(**histogram_options)

    plt.suptitle("Feature Distributions")
    plt.show()
|
||||
|
||||
def display_scatter_plot_matrix(df):
    """Show a seaborn pair plot of ``df`` coloured by ``Stress_Level``.

    Args:
        df: DataFrame containing a ``Stress_Level`` column used as the hue.
    """
    hue_column = "Stress_Level"
    sns.pairplot(data=df, hue=hue_column)

    # y is set just above 1 so the title sits above the plot grid.
    plt.suptitle("Pair Plot of Numerical Features", y=1.02)
    plt.show()
|
||||
|
||||
def display_correlation_heatmap(df):
    """Show an annotated heatmap of correlations between numeric columns.

    Args:
        df: The DataFrame to correlate; only its numeric columns are used.
    """
    correlation_matrix = df.corr(numeric_only=True)

    sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
    plt.title("Correlation Heatmap")
    plt.show()
|
||||
|
||||
def display_feature_boxplots(df):
    """Show one box plot per numeric column, grouped by ``Stress_Level``.

    Each plot is shown on its own before the next one is drawn.

    Args:
        df: DataFrame containing a ``Stress_Level`` column for the x axis.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    for feature in numeric_columns:
        sns.boxplot(data=df, x="Stress_Level", y=feature)
        plt.title(f"{feature} by Stress Level")
        plt.show()
|
||||
|
||||
def draw_plots(df):
    """Render every exploratory plot for ``df``, one after another.

    Args:
        df: The DataFrame handed unchanged to each plotting function.
    """
    plot_steps = (
        display_feature_distributions_histogram,
        display_scatter_plot_matrix,
        display_correlation_heatmap,
        display_feature_boxplots,
    )
    for render in plot_steps:
        render(df)
|
||||
|
||||
def preprocess_data(df):
    """Apply the preprocessing steps to ``df`` in order.

    Rows with missing values are dropped first, then ``Stress_Level`` is
    converted to an ordered categorical.

    Args:
        df: The DataFrame passed to each step; nothing is returned.
    """
    for step in (clean_data, order_data_stress_level):
        step(df)
|
||||
|
||||
# Run the pipeline only when this file is executed as a script. An unguarded
# call would also load the CSV and open every plot window whenever the module
# is merely imported.
if __name__ == "__main__":
    main()
|
||||
@@ -14,4 +14,25 @@ The main goal of this project is to predict a student's stress level, categorize
|
||||
|
||||
The dataset used in this project was sourced from [Kaggle](https://www.kaggle.com/datasets/afnansaifafnan/study-habits-and-activities-of-students), containing information about time spent on daily activities, academic performance, and reported stress level. Each entry has figures like time spent studying, sleeping, exercising, socializing, and participating in extracurricular activities, as well as GPA.
|
||||
|
||||
The target variable is the **stress level**, indicated as *low*, *moderate* or *high*. These features will allow us to explore the correlation between lifestyle patterns, academic performance and stress. They will also help us build a predictive model capable of identifying which habits most strongly influence stress.
|
||||
The target variable is the **stress level**, indicated as *low*, *moderate* or *high*. These features will allow us to explore the correlation between lifestyle patterns, academic performance and stress. They will also help us build a predictive model capable of identifying which habits most strongly influence stress.
|
||||
|
||||
---
|
||||
|
||||
## Exploratory Data Analysis
|
||||
|
||||
**Key Insights:**
|
||||
- Students who study more are more likely to have a higher GPA and more stress.
|
||||
- Physical activity is negatively correlated with the other activities, including studying, and therefore with stress.
|
||||
- Students who sleep more are less likely to be very stressed.
|
||||
- No extreme outliers were observed.
|
||||
|
||||
**Figures:**
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||

|
||||