Implemented feature engineering
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 19 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 20 KiB |
@@ -29,7 +29,7 @@ def main():
|
|||||||
X, y, test_size=0.2, stratify=y, random_state=0
|
X, y, test_size=0.2, stratify=y, random_state=0
|
||||||
)
|
)
|
||||||
|
|
||||||
# feature engineering
|
# pre-training processing
|
||||||
X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
|
X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
|
||||||
|
|
||||||
# training
|
# training
|
||||||
@@ -40,9 +40,14 @@ def main():
|
|||||||
|
|
||||||
# evaluation
|
# evaluation
|
||||||
le = get_label_encoder(df_clean)
|
le = get_label_encoder(df_clean)
|
||||||
draw_feature_importance(model, X)
|
# draw_feature_importance(model, X)
|
||||||
draw_confusion_matrix(y_test, y_pred, le)
|
draw_confusion_matrix(y_test, y_pred, le)
|
||||||
draw_classification_report(y_test, y_pred, le)
|
draw_classification_report(y_test, y_pred, le)
|
||||||
|
evaluate_accuracy(y_test, y_pred)
|
||||||
|
|
||||||
|
def evaluate_accuracy(y_test, y_pred):
|
||||||
|
acc = accuracy_score(y_test, y_pred)
|
||||||
|
print(f"Model Accuracy: {acc:.4f}")
|
||||||
|
|
||||||
def get_label_encoder(df):
|
def get_label_encoder(df):
|
||||||
le = LabelEncoder()
|
le = LabelEncoder()
|
||||||
@@ -50,16 +55,35 @@ def get_label_encoder(df):
|
|||||||
return le
|
return le
|
||||||
|
|
||||||
def draw_classification_report(y_test, y_pred, le):
|
def draw_classification_report(y_test, y_pred, le):
|
||||||
report = classification_report(y_test, y_pred, output_dict=True, target_names=le.classes_)
|
report = classification_report(
|
||||||
|
y_test, y_pred, output_dict=True, target_names=le.classes_
|
||||||
|
)
|
||||||
df_report = pd.DataFrame(report).transpose()
|
df_report = pd.DataFrame(report).transpose()
|
||||||
|
|
||||||
df_report.loc[le.classes_, ["precision", "recall", "f1-score"]].plot(
|
metrics_df = df_report.loc[le.classes_, ["precision", "recall", "f1-score"]]
|
||||||
kind="bar", figsize=(8, 5), rot=0, color=["#4C72B0", "#55A868", "#C44E52"]
|
|
||||||
|
ax = metrics_df.plot(
|
||||||
|
kind="bar",
|
||||||
|
figsize=(8, 5),
|
||||||
|
rot=0,
|
||||||
|
color=["#4C72B0", "#55A868", "#C44E52"]
|
||||||
)
|
)
|
||||||
|
|
||||||
plt.title("Classification Report Metrics")
|
plt.title("Classification Report Metrics")
|
||||||
plt.ylabel("Score")
|
plt.ylabel("Score")
|
||||||
plt.ylim(0, 1)
|
plt.ylim(0, 1)
|
||||||
plt.legend(loc="lower right")
|
plt.legend(loc="lower right")
|
||||||
|
|
||||||
|
for p in ax.patches:
|
||||||
|
height = p.get_height()
|
||||||
|
ax.annotate(
|
||||||
|
f"{height:.2f}",
|
||||||
|
(p.get_x() + p.get_width() / 2, height),
|
||||||
|
ha='center',
|
||||||
|
va='bottom',
|
||||||
|
fontsize=9
|
||||||
|
)
|
||||||
|
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
@@ -151,7 +175,7 @@ def remove_outliers(df):
|
|||||||
|
|
||||||
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
|
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
|
||||||
if len(numeric_cols) == 0:
|
if len(numeric_cols) == 0:
|
||||||
print("No numeric columns detected.")
|
# print("No numeric columns detected.")
|
||||||
return df_clean
|
return df_clean
|
||||||
|
|
||||||
mask = np.ones(len(df_clean), dtype=bool)
|
mask = np.ones(len(df_clean), dtype=bool)
|
||||||
@@ -169,7 +193,7 @@ def remove_outliers(df):
|
|||||||
|
|
||||||
df_clean = df_clean[mask]
|
df_clean = df_clean[mask]
|
||||||
|
|
||||||
print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.")
|
# print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.")
|
||||||
|
|
||||||
return df_clean
|
return df_clean
|
||||||
|
|
||||||
@@ -211,6 +235,9 @@ def draw_plots(df):
|
|||||||
def preprocess_data(df):
|
def preprocess_data(df):
|
||||||
# removing unneeded feature
|
# removing unneeded feature
|
||||||
df.drop("Student_ID", axis=1, inplace=True)
|
df.drop("Student_ID", axis=1, inplace=True)
|
||||||
|
df.drop("GPA", axis=1, inplace=True)
|
||||||
|
df.drop("Extracurricular_Hours_Per_Day", axis=1, inplace=True)
|
||||||
|
df.drop("Social_Hours_Per_Day", axis=1, inplace=True)
|
||||||
df_clean = clean_data(df)
|
df_clean = clean_data(df)
|
||||||
order_data_stress_level(df_clean)
|
order_data_stress_level(df_clean)
|
||||||
df_clean = remove_outliers(df_clean)
|
df_clean = remove_outliers(df_clean)
|
||||||
|
|||||||
@@ -29,7 +29,6 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
|
|||||||
**Figures:**
|
**Figures:**
|
||||||

|

|
||||||

|

|
||||||

|
|
||||||

|

|
||||||

|

|
||||||

|

|
||||||
@@ -46,3 +45,14 @@ No missing values or duplicate rows were found in the dataset. Outliers in numer
|
|||||||

|

|
||||||

|

|
||||||

|

|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Feature Engineering
|
||||||
|
|
||||||
|
To improve model performance and reduce redundancy, I performed feature engineering before training:
|
||||||
|
- **GPA** was removed because it was highly correlated with **study time**; dropping it reduces redundant information and the risk of multicollinearity.
|
||||||
|
- **Extracurricular activity time** and **social time** were removed because of their low predictive importance, which minimizes noise and helps the model focus on the most relevant factors.
|
||||||
|
|
||||||
|

|
||||||
|

|
||||||
Reference in New Issue
Block a user