The average accuracy is now calculated over 1000 training runs
This commit is contained in:
Binary file not shown.
|
After Width: | Height: | Size: 1.4 KiB |
@@ -25,29 +25,35 @@ def main():
|
|||||||
X, y = separate_features_and_target(df_clean)
|
X, y = separate_features_and_target(df_clean)
|
||||||
|
|
||||||
# split into train and test data
|
# split into train and test data
|
||||||
X_train, X_test, y_train, y_test = train_test_split(
|
accuracy_scores = []
|
||||||
X, y, test_size=0.2, stratify=y, random_state=0
|
# run training many times using different splits to get an average accuracy score
|
||||||
)
|
for i in range(1000):
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(
|
||||||
|
X, y, test_size=0.2, stratify=y, random_state=i
|
||||||
|
)
|
||||||
|
|
||||||
# pre training processing
|
# pre training processing
|
||||||
X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
|
X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
|
||||||
|
|
||||||
# training
|
# training
|
||||||
model = train_logistic_regression(X_train_normalized, y_train)
|
model = train_logistic_regression(X_train_normalized, y_train)
|
||||||
|
|
||||||
# prediction
|
# prediction
|
||||||
y_pred = predict_target(model, X_test_normalized)
|
y_pred = predict_target(model, X_test_normalized)
|
||||||
|
|
||||||
# evaluation
|
# evaluation
|
||||||
le = get_label_encoder(df_clean)
|
le = get_label_encoder(df_clean)
|
||||||
# draw_feature_importance(model, X)
|
# draw_feature_importance(model, X)
|
||||||
# draw_confusion_matrix(y_test, y_pred, le)
|
# draw_confusion_matrix(y_test, y_pred, le)
|
||||||
# draw_classification_report(y_test, y_pred, le)
|
# draw_classification_report(y_test, y_pred, le)
|
||||||
evaluate_accuracy(y_test, y_pred)
|
accuracy = get_accuracy(y_test, y_pred)
|
||||||
|
accuracy_scores.append(accuracy)
|
||||||
|
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
|
||||||
|
print(f"Samples: {len(accuracy_scores)}")
|
||||||
|
|
||||||
def evaluate_accuracy(y_test, y_pred):
|
def get_accuracy(y_test, y_pred):
|
||||||
acc = accuracy_score(y_test, y_pred)
|
accuracy = accuracy_score(y_test, y_pred)
|
||||||
print(f"Model Accuracy: {acc:.4f}")
|
return accuracy
|
||||||
|
|
||||||
def get_label_encoder(df):
|
def get_label_encoder(df):
|
||||||
le = LabelEncoder()
|
le = LabelEncoder()
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ No missing values or duplicate rows were found in the dataset. Outliers in numer
|
|||||||
|
|
||||||

|

|
||||||

|

|
||||||

|

|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -60,4 +60,12 @@ To improve model performance and reduce redundancy, I performed feature engineer
|
|||||||
|
|
||||||
## Modeling
|
## Modeling
|
||||||
|
|
||||||
This model was built using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing these custom settings entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
|
This model was built using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing these custom settings entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|

|
||||||
|

|
||||||
|

|
||||||
Reference in New Issue
Block a user