Split data and encoded target

This commit is contained in:
Drew Giffin
2025-10-19 17:37:58 -04:00
parent 441c121751
commit 3239fe916a
+27 -4
View File
@@ -2,23 +2,46 @@ import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
data_path = "student_lifestyle_dataset.csv" data_path = "student_lifestyle_dataset.csv"
def main(*, test_size=0.2, random_state=42):
    """Run the pipeline: load, preprocess, normalize, encode target, split.

    The cleaned frame is separated into features (every column except
    ``Stress_Level``) and the target (``Stress_Level``). The target is
    label-encoded to integers and the data is split into stratified
    train/test subsets. A short sanity report is printed at the end.

    Args:
        test_size: Fraction of the rows held out for the test set.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible between runs. Pass ``None`` for a different
            random split on each run.

    Raises:
        KeyError: If the cleaned frame has no ``Stress_Level`` column.
    """
    # loading
    df = load_data()

    # preprocessing
    df_clean = preprocess_data(df)

    # exploratory data analysis
    # draw_plots(df_clean)

    # feature engineering
    # NOTE(review): the return value is discarded, so this presumably
    # normalizes df_clean in place -- confirm against normalize_features.
    normalize_features(df_clean)

    # separate features and target
    X = df_clean.drop('Stress_Level', axis=1)
    y_raw = df_clean['Stress_Level']

    # encode target (le.classes_ keeps the original labels, index == code)
    le = LabelEncoder()
    y = le.fit_transform(y_raw)

    # split into train and test data; stratify keeps the class proportions
    # equal in both subsets, random_state makes the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )

    # sanity check
    print("Classes:", le.classes_)
    print("y_train distribution:", pd.Series(y_train).value_counts(normalize=True))
    print("y_test distribution:", pd.Series(y_test).value_counts(normalize=True))
    print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
def load_data(): def load_data():
df = pd.read_csv(data_path, encoding="ascii", delimiter=",") df = pd.read_csv(data_path, encoding="ascii", delimiter=",")
#removing uneeded feature #removing uneeded feature