- Dataset Source : https://www.kaggle.com/datasets/sonalshinde123/work-from-home-employee-burnout-dataset/code
- Notebook: Burnout Risk Classification || ML & DL (Kaggle notebook using the Work From Home Employee Burnout Dataset)
Import Libraries and Read Dataset
# For EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, RobustScaler
# For Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.exceptions import FitFailedWarning
# For Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
#--- read part ---#
df = pd.read_csv('/kaggle/input/work-from-home-employee-burnout-dataset/work_from_home_burnout_dataset.csv')
df.describe().T
df.describe(include='object').T


Data Preprocessing
Looking at the data, the first things to do are to check whether the data is balanced and to remove anything that could interfere with predicting the target variable.
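As a quick sanity check (not in the original notebook), the target's class balance can be inspected directly before anything else:

# Class balance of the target (sketch; run before any encoding).
print(df['burnout_risk'].value_counts(normalize=True) * 100)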
1. Check for duplicates, missing values, and outliers
print('Missing Value (%)')
print(df.isna().mean() * 100)
print('\nDuplicate Rows (%)')
print(df.duplicated().mean() * 100)

I have to admit I completely forgot to actually check for outliers.
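For completeness, here is the kind of check that was skipped: a minimal IQR-based sketch (not in the original notebook), run on the numeric columns before any scaling.

# Hindsight sketch: flag values outside 1.5 * IQR per numeric column.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    print(f"{col}: {mask.sum()} potential outliers ({mask.mean() * 100:.2f}%)")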
2. Convert object-type columns into a numeric form for model training
df['day_type'] = df['day_type'].map({
    'Weekday': 0,
    'Weekend': 1
})
df['burnout_risk'] = df['burnout_risk'].map({
    'Low': 0,
    'Medium': 1,
    'High': 2
})
Since the order of these categories carries information, I mapped the string values to integers rather than one-hot encoding them.
Related post — why one-hot encoding is used: whitecode2718.tistory.com
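To make the contrast concrete, here is a minimal sketch using a hypothetical 'department' column (not in this dataset): one-hot encoding suits nominal categories with no inherent order, while the integer mapping above suits ordinal ones.

# Hypothetical example: 'department' is NOT a column in this dataset.
# For nominal categories, one-hot encoding avoids implying a false order.
example = pd.DataFrame({'department': ['Sales', 'IT', 'HR', 'IT']})
print(pd.get_dummies(example, columns=['department']))
# burnout_risk, by contrast, is ordinal (Low < Medium < High),
# so the integer mapping above preserves the order information.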
3. Normalize the data
Each column has its own independent numeric range, which can cause problems when predicting the target variable, so I normalize the ranges with a scaler.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])
original_objects_cols = list(label_encoders.keys())

num_col = df.select_dtypes(include=['int64', 'float64']).columns
exclude_cols = ['burnout_risk', 'day_type']
num_col_scale = [col for col in num_col if col not in original_objects_cols and col not in exclude_cols]

scaler = StandardScaler()
df[num_col_scale] = scaler.fit_transform(df[num_col_scale])
df[num_col_scale].head()
4. Drop unnecessary columns
Even without running a correlation analysis, user_id is clearly unrelated to burnout_risk, so I drop it.
df.drop(columns=['user_id'], inplace=True)
Next, to guard against multicollinearity, I analyze the correlation between each pair of columns.
# (Re-)encode any remaining object columns; after the mapping and encoding
# above this is effectively a no-op safety pass.
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

correlation_matrix = df.corr(method='pearson')
plt.figure(figsize=(18, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", cbar=True)
plt.title("Correlation Matrix", fontsize=16)
plt.show()

screen_time_hours and work_hours are so highly correlated that multicollinearity is a concern, but since each feature captures something distinct, I consider both important and will not drop either.
Of course, the fallback strategy is that if prediction performance turns out noticeably poor, I will drop one of the two and redo the analysis.
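If you want to quantify that concern rather than eyeball the heatmap, variance inflation factors are one option. A sketch using statsmodels (not part of the original notebook); a common rule of thumb treats VIF above roughly 10 as problematic:

# Optional VIF check on the encoded, scaled feature set.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

features = df.drop(columns=['burnout_risk'])
exog = add_constant(features).values
for i, col in enumerate(['const'] + list(features.columns)):
    print(f"{col}: VIF = {variance_inflation_factor(exog, i):.2f}")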
Comparison of Machine Learning Model Performance
X = df.drop('burnout_risk', axis=1)
y = df['burnout_risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42)
}

def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    return accuracy, precision, recall, f1

results = []
for name, model in models.items():
    accuracy, precision, recall, f1 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })

results_df = pd.DataFrame(results)
results_df
This code designates the features (X) and the target (y), builds the train/test split, and evaluates the models in a single batch. Since this is a classification problem, I used Accuracy, Precision, Recall, and F1-score as the evaluation metrics.

Overall, the models show very high accuracy. The fact that every model lands on the identical accuracy of 0.997222 suggests their confusion matrices are the same, and this may be the maximum accuracy achievable by these algorithms on this data.
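That guess can be checked directly rather than inferred. A quick sketch that prints each model's confusion matrix (the models were already fitted in the loop above):

# Verify whether all ML models really make identical predictions.
for name, model in models.items():
    cm = confusion_matrix(y_test, model.predict(X_test))
    print(f"{name}:\n{cm}\n")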
Comparison of Deep Learning Model Performance
From here on, the code designs and trains neural-network (NN) deep learning models.
def plot_history(history, main_title):
    plt.figure(figsize=(12, 5))
    plt.suptitle(main_title, fontsize=20, fontweight='bold')
    plt.subplot(1, 2, 1)
    plt.plot(history['accuracy'], label='Training Accuracy')
    plt.plot(history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy', fontsize=16)
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel('Accuracy', fontsize=12)
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(history['loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel('Loss', fontsize=12)
    plt.legend()
    plt.tight_layout(rect=[0, 0, 1, 0.92])
    plt.show()

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)
# NN Model - Multiclass (softmax)
num_classes = len(np.unique(y_train))
simple_nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])
simple_nn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
simple_nn.summary()

history = simple_nn.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

test_loss, test_accuracy = simple_nn.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")
y_pred = simple_nn.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")

plot_history(history.history, "NN (3-class Classification)")

cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ANN Model
ann_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
ann_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
ann_model.summary()

history = ann_model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

test_loss, test_accuracy = ann_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")
y_pred = ann_model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")

plot_history(history.history, "ANN (3-class Classification)")

cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# DNN Model
dnn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])
dnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
dnn_model.summary()

history = dnn_model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

test_loss, test_accuracy = dnn_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")
y_pred = dnn_model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")

plot_history(history.history, "DNN (3-class Classification)")

cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# MLP Model
mlp_model = Sequential([
    Dense(128, activation='sigmoid', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='sigmoid'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
mlp_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
mlp_model.summary()

history = mlp_model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

test_loss, test_accuracy = mlp_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")
y_pred = mlp_model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")

plot_history(history.history, "MLP (3-class Classification)")

cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

X_train_np = X_train.values
X_test_np = X_test.values
X_train_reshaped = X_train_np.reshape((X_train_np.shape[0], X_train_np.shape[1], 1))
X_test_reshaped = X_test_np.reshape((X_test_np.shape[0], X_test_np.shape[1], 1))

# CNN Model
cnn_model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    MaxPooling1D(pool_size=1, strides=1),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
cnn_model.summary()

history = cnn_model.fit(
    X_train_reshaped, y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test_reshaped, y_test),
    callbacks=[early_stopping]
)

test_loss, test_accuracy = cnn_model.evaluate(X_test_reshaped, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")
y_pred = cnn_model.predict(X_test_reshaped)  # use the reshaped 3-D input, not the 2-D X_test
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")

plot_history(history.history, "CNN (3-class Classification)")

cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

def evaluate_model_on_test(model, X_test, y_test):
    X_test_np = np.asarray(X_test)
    # Conv1D models expect 3-D input (samples, features, 1); Dense models take 2-D.
    # Checking the model's own input shape (rather than the data's) ensures the
    # CNN gets reshaped input even when a 2-D array is passed in.
    if len(model.input_shape) == 3:
        X_test_input = X_test_np.reshape((X_test_np.shape[0], X_test_np.shape[1], 1))
    else:
        X_test_input = X_test_np
    y_pred_prob = model.predict(X_test_input, verbose=0)
    y_pred = np.argmax(y_pred_prob, axis=1)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    return accuracy, precision, recall, f1

results_deep = []
models_dict = {
    "Simple NN": simple_nn,
    "DNN": dnn_model,
    "ANN": ann_model,
    "MLP": mlp_model,
    "CNN": cnn_model,
}
for model_name, model in models_dict.items():
    acc, precision, recall, f1 = evaluate_model_on_test(model, X_test, y_test)
    results_deep.append({
        "Model": model_name,
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })
results_df2 = pd.DataFrame(results_deep)
results_df2

Overall, the performance is similar to the ML models.
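To compare them at a glance, the two result tables can be concatenated. An optional sketch (the 'Type' column is my own addition):

# Optional: put the ML and DL results side by side.
combined = pd.concat(
    [results_df.assign(Type="ML"), results_df2.assign(Type="DL")],
    ignore_index=True
)
combined.sort_values("Accuracy", ascending=False)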
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest Model (Machine Learning):")
print(best_model)

metrics = ["Accuracy", "Precision", "Recall", "F1-Score"]
colors = ["skyblue", "orange", "green", "red"]
fig, ax = plt.subplots(figsize=(10, 6))
for i, metric in enumerate(metrics):
    ax.barh(results_df['Model'] + f" ({metric})", results_df[metric], color=colors[i], alpha=0.7, label=metric, height=0.6)
ax.set_xlabel("Score", fontsize=12)
ax.set_title("Model Performance Comparison", fontsize=18)
ax.legend(title="Metrics")
plt.tight_layout()
plt.show()

print(f"\nOptimal Model based on Accuracy: {best_model['Model']} with Accuracy = {best_model['Accuracy']:.4f}")

In the end, Decision Tree appears to be the best performer, but since virtually all the ML models show the same performance, comparing them against one another is not particularly meaningful.
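If I wanted to break the tie more rigorously, stratified k-fold cross-validation would give each model several independent scores instead of a single split. A sketch (not run in this notebook):

# Sketch: 5-fold stratified CV per model; cross_val_score clones each model.
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} +/- {scores.std():.4f}")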