diff --git a/main.py b/main.py
index bff6dbb..9487b81 100644
--- a/main.py
+++ b/main.py
@@ -2,23 +2,46 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
+
 from sklearn.preprocessing import MinMaxScaler
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
 
 data_path = "student_lifestyle_dataset.csv"
 
 def main():
+    """Load and prepare the data, then make a stratified train/test split."""
-    #loading
+    # loading
     df = load_data()
     
-    #preprocessing
+    # preprocessing
     df_clean = preprocess_data(df)
     
-    #exploratory data analysis
+    # exploratory data analysis
     # draw_plots(df_clean)
     
-    #feature engineering
+    # feature engineering
+    # NOTE(review): result is discarded; assumes df_clean is scaled in place - confirm
     normalize_features(df_clean)
     
+    # separate features from the target and integer-encode the class labels
+    X = df_clean.drop('Stress_Level', axis=1)
+    le = LabelEncoder()
+    y = le.fit_transform(df_clean['Stress_Level'])
+
+    # split into train and test data; stratify keeps the class proportions equal
+    # in both parts, and the fixed seed makes the split repeatable across runs
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, stratify=y, random_state=42
+    )
+
+    # sanity check
+    print("Classes:", le.classes_)
+    print("y_train distribution:", pd.Series(y_train).value_counts(normalize=True))
+    print("y_test distribution:", pd.Series(y_test).value_counts(normalize=True))
+    print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
+    
+    
 def load_data():
     df = pd.read_csv(data_path, encoding="ascii", delimiter=",")
     #removing uneeded feature