ML
lightgbm_classification
yuurimingg
2024. 1. 23. 16:57
LightGBM
Wisconsin Breast Cancer Prediction
In [1]:
# To use early_stopping_rounds in lightgbm's fit(), downgrade to version '3.3.1' or '3.3.2'
# !pip install lightgbm==3.3.2
import lightgbm
lightgbm.__version__
Out[1]:
'3.3.2'
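For reference: in LightGBM 4.x the early_stopping_rounds and verbose keyword arguments were removed from fit(), and early stopping is requested through callbacks instead. A minimal sketch of the modern form (it reuses the X_tr/X_val split created in the next cell):

import lightgbm as lgb
from lightgbm import LGBMClassifier

model = LGBMClassifier(n_estimators = 400, learning_rate = 0.05)
model.fit(X_tr, y_tr, eval_metric = 'logloss', eval_set = [(X_tr, y_tr), (X_val, y_val)],
          callbacks = [lgb.early_stopping(stopping_rounds = 50),  # stop after 50 rounds without improvement
                       lgb.log_evaluation(period = 10)])          # log the eval metric every 10 iterations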
In [28]:
# Import LGBMClassifier from the lightgbm Python package
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
dataset = load_breast_cancer() # load the dataset
dataset.keys()
df = pd.DataFrame(data = dataset.data, columns = dataset.feature_names) # convert to a DataFrame
df['target'] = dataset.target # add the target values
X_features = df.iloc[:, :-1]
y_label = df.iloc[:, -1]
# Split the full dataset: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label, test_size = 0.2, random_state = 156)
# Split the X_train, y_train made above again: 90% for training, 10% for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state = 156)
# As with XGBoost earlier, set n_estimators to 400
lgbm_wrapper = LGBMClassifier(n_estimators = 400, learning_rate = 0.05) # create the model
# Apply early stopping to LightGBM in the same way
evals = [(X_tr, y_tr), (X_val, y_val)]
lgbm_wrapper.fit(X_tr, y_tr, early_stopping_rounds = 50, eval_metric = 'logloss',
                 eval_set = evals, verbose = True) # train
preds = lgbm_wrapper.predict(X_test) # predict class labels
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1] # predict positive-class probabilities
Out[28]:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
[1] training's binary_logloss: 0.625671 valid_1's binary_logloss: 0.628248
[2] training's binary_logloss: 0.588173 valid_1's binary_logloss: 0.601106
[3] training's binary_logloss: 0.554518 valid_1's binary_logloss: 0.577587
[4] training's binary_logloss: 0.523972 valid_1's binary_logloss: 0.556324
[5] training's binary_logloss: 0.49615 valid_1's binary_logloss: 0.537407
[6] training's binary_logloss: 0.470108 valid_1's binary_logloss: 0.519401
[7] training's binary_logloss: 0.446647 valid_1's binary_logloss: 0.502637
[8] training's binary_logloss: 0.425055 valid_1's binary_logloss: 0.488311
[9] training's binary_logloss: 0.405125 valid_1's binary_logloss: 0.474664
[10] training's binary_logloss: 0.386526 valid_1's binary_logloss: 0.461267
[11] training's binary_logloss: 0.367027 valid_1's binary_logloss: 0.444274
[12] training's binary_logloss: 0.350713 valid_1's binary_logloss: 0.432755
[13] training's binary_logloss: 0.334601 valid_1's binary_logloss: 0.421371
[14] training's binary_logloss: 0.319854 valid_1's binary_logloss: 0.411418
[15] training's binary_logloss: 0.306374 valid_1's binary_logloss: 0.402989
[16] training's binary_logloss: 0.293116 valid_1's binary_logloss: 0.393973
[17] training's binary_logloss: 0.280812 valid_1's binary_logloss: 0.384801
[18] training's binary_logloss: 0.268352 valid_1's binary_logloss: 0.376191
[19] training's binary_logloss: 0.256942 valid_1's binary_logloss: 0.368378
[20] training's binary_logloss: 0.246443 valid_1's binary_logloss: 0.362062
[21] training's binary_logloss: 0.236874 valid_1's binary_logloss: 0.355162
[22] training's binary_logloss: 0.227501 valid_1's binary_logloss: 0.348933
[23] training's binary_logloss: 0.218988 valid_1's binary_logloss: 0.342819
[24] training's binary_logloss: 0.210621 valid_1's binary_logloss: 0.337386
[25] training's binary_logloss: 0.202076 valid_1's binary_logloss: 0.331523
[26] training's binary_logloss: 0.194199 valid_1's binary_logloss: 0.326349
[27] training's binary_logloss: 0.187107 valid_1's binary_logloss: 0.322785
[28] training's binary_logloss: 0.180535 valid_1's binary_logloss: 0.317877
[29] training's binary_logloss: 0.173834 valid_1's binary_logloss: 0.313928
[30] training's binary_logloss: 0.167198 valid_1's binary_logloss: 0.310105
[31] training's binary_logloss: 0.161229 valid_1's binary_logloss: 0.307107
[32] training's binary_logloss: 0.155494 valid_1's binary_logloss: 0.303837
[33] training's binary_logloss: 0.149125 valid_1's binary_logloss: 0.300315
[34] training's binary_logloss: 0.144045 valid_1's binary_logloss: 0.297816
[35] training's binary_logloss: 0.139341 valid_1's binary_logloss: 0.295387
[36] training's binary_logloss: 0.134625 valid_1's binary_logloss: 0.293063
[37] training's binary_logloss: 0.129167 valid_1's binary_logloss: 0.289127
[38] training's binary_logloss: 0.12472 valid_1's binary_logloss: 0.288697
[39] training's binary_logloss: 0.11974 valid_1's binary_logloss: 0.28576
[40] training's binary_logloss: 0.115054 valid_1's binary_logloss: 0.282853
[41] training's binary_logloss: 0.110662 valid_1's binary_logloss: 0.279441
[42] training's binary_logloss: 0.106358 valid_1's binary_logloss: 0.28113
[43] training's binary_logloss: 0.102324 valid_1's binary_logloss: 0.279139
[44] training's binary_logloss: 0.0985699 valid_1's binary_logloss: 0.276465
[45] training's binary_logloss: 0.094858 valid_1's binary_logloss: 0.275946
[46] training's binary_logloss: 0.0912486 valid_1's binary_logloss: 0.272819
[47] training's binary_logloss: 0.0883115 valid_1's binary_logloss: 0.272306
[48] training's binary_logloss: 0.0849963 valid_1's binary_logloss: 0.270452
[49] training's binary_logloss: 0.0821742 valid_1's binary_logloss: 0.268671
[50] training's binary_logloss: 0.0789991 valid_1's binary_logloss: 0.267587
[51] training's binary_logloss: 0.0761072 valid_1's binary_logloss: 0.26626
[52] training's binary_logloss: 0.0732567 valid_1's binary_logloss: 0.265542
[53] training's binary_logloss: 0.0706388 valid_1's binary_logloss: 0.264547
[54] training's binary_logloss: 0.0683911 valid_1's binary_logloss: 0.26502
[55] training's binary_logloss: 0.0659347 valid_1's binary_logloss: 0.264388
[56] training's binary_logloss: 0.0636873 valid_1's binary_logloss: 0.263128
[57] training's binary_logloss: 0.0613354 valid_1's binary_logloss: 0.26231
[58] training's binary_logloss: 0.0591944 valid_1's binary_logloss: 0.262011
[59] training's binary_logloss: 0.057033 valid_1's binary_logloss: 0.261454
[60] training's binary_logloss: 0.0550801 valid_1's binary_logloss: 0.260746
[61] training's binary_logloss: 0.0532381 valid_1's binary_logloss: 0.260236
[62] training's binary_logloss: 0.0514074 valid_1's binary_logloss: 0.261586
[63] training's binary_logloss: 0.0494837 valid_1's binary_logloss: 0.261797
[64] training's binary_logloss: 0.0477826 valid_1's binary_logloss: 0.262533
[65] training's binary_logloss: 0.0460364 valid_1's binary_logloss: 0.263305
[66] training's binary_logloss: 0.0444552 valid_1's binary_logloss: 0.264072
[67] training's binary_logloss: 0.0427638 valid_1's binary_logloss: 0.266223
[68] training's binary_logloss: 0.0412449 valid_1's binary_logloss: 0.266817
[69] training's binary_logloss: 0.0398589 valid_1's binary_logloss: 0.267819
[70] training's binary_logloss: 0.0383095 valid_1's binary_logloss: 0.267484
[71] training's binary_logloss: 0.0368803 valid_1's binary_logloss: 0.270233
[72] training's binary_logloss: 0.0355637 valid_1's binary_logloss: 0.268442
[73] training's binary_logloss: 0.0341747 valid_1's binary_logloss: 0.26895
[74] training's binary_logloss: 0.0328302 valid_1's binary_logloss: 0.266958
[75] training's binary_logloss: 0.0317853 valid_1's binary_logloss: 0.268091
[76] training's binary_logloss: 0.0305626 valid_1's binary_logloss: 0.266419
[77] training's binary_logloss: 0.0295001 valid_1's binary_logloss: 0.268588
[78] training's binary_logloss: 0.0284699 valid_1's binary_logloss: 0.270964
[79] training's binary_logloss: 0.0273953 valid_1's binary_logloss: 0.270293
[80] training's binary_logloss: 0.0264668 valid_1's binary_logloss: 0.270523
[81] training's binary_logloss: 0.0254636 valid_1's binary_logloss: 0.270683
[82] training's binary_logloss: 0.0245911 valid_1's binary_logloss: 0.273187
[83] training's binary_logloss: 0.0236486 valid_1's binary_logloss: 0.275994
[84] training's binary_logloss: 0.0228047 valid_1's binary_logloss: 0.274053
[85] training's binary_logloss: 0.0221693 valid_1's binary_logloss: 0.273211
[86] training's binary_logloss: 0.0213043 valid_1's binary_logloss: 0.272626
[87] training's binary_logloss: 0.0203934 valid_1's binary_logloss: 0.27534
[88] training's binary_logloss: 0.0195552 valid_1's binary_logloss: 0.276228
[89] training's binary_logloss: 0.0188623 valid_1's binary_logloss: 0.27525
[90] training's binary_logloss: 0.0183664 valid_1's binary_logloss: 0.276485
[91] training's binary_logloss: 0.0176788 valid_1's binary_logloss: 0.277052
[92] training's binary_logloss: 0.0170059 valid_1's binary_logloss: 0.277686
[93] training's binary_logloss: 0.0164317 valid_1's binary_logloss: 0.275332
[94] training's binary_logloss: 0.015878 valid_1's binary_logloss: 0.276236
[95] training's binary_logloss: 0.0152959 valid_1's binary_logloss: 0.274538
[96] training's binary_logloss: 0.0147216 valid_1's binary_logloss: 0.275244
[97] training's binary_logloss: 0.0141758 valid_1's binary_logloss: 0.275829
[98] training's binary_logloss: 0.0136551 valid_1's binary_logloss: 0.276654
[99] training's binary_logloss: 0.0131585 valid_1's binary_logloss: 0.277859
[100] training's binary_logloss: 0.0126961 valid_1's binary_logloss: 0.279265
[101] training's binary_logloss: 0.0122421 valid_1's binary_logloss: 0.276695
[102] training's binary_logloss: 0.0118067 valid_1's binary_logloss: 0.278488
[103] training's binary_logloss: 0.0113994 valid_1's binary_logloss: 0.278932
[104] training's binary_logloss: 0.0109799 valid_1's binary_logloss: 0.280997
[105] training's binary_logloss: 0.0105953 valid_1's binary_logloss: 0.281454
[106] training's binary_logloss: 0.0102381 valid_1's binary_logloss: 0.282058
[107] training's binary_logloss: 0.00986714 valid_1's binary_logloss: 0.279275
[108] training's binary_logloss: 0.00950998 valid_1's binary_logloss: 0.281427
[109] training's binary_logloss: 0.00915965 valid_1's binary_logloss: 0.280752
[110] training's binary_logloss: 0.00882581 valid_1's binary_logloss: 0.282152
[111] training's binary_logloss: 0.00850714 valid_1's binary_logloss: 0.280894
Out[28]:
LGBMClassifier(learning_rate=0.05, n_estimators=400)
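The log above shows why training stopped at iteration 111: the validation logloss bottomed out at iteration 61 (0.260236) and then failed to improve for the following 50 rounds. A short sketch to confirm this from the fitted model (best_iteration_ and evals_result_ are attributes the scikit-learn wrapper exposes after early stopping):

import matplotlib.pyplot as plt

print(lgbm_wrapper.best_iteration_)   # best validation round; 61 in the log above
results = lgbm_wrapper.evals_result_  # {'training': {...}, 'valid_1': {...}}
plt.plot(results['training']['binary_logloss'], label = 'training')
plt.plot(results['valid_1']['binary_logloss'], label = 'valid_1')
plt.xlabel('iteration'); plt.ylabel('binary_logloss'); plt.legend()
plt.show()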
In [ ]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred = None, pred_proba = None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # add ROC-AUC
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('Confusion Matrix')
    print(confusion)
    # print ROC-AUC as well
    print('Accuracy : {0:.4f}, Precision : {1:.4f}, Recall : {2:.4f}, F1 : {3:.4f}, AUC : {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))
In [29]:
get_clf_eval(y_test, preds, pred_proba) # evaluate
Confusion Matrix
[[34 3]
[ 2 75]]
Accuracy : 0.9561, Precision : 0.9615, Recall : 0.9740, F1 : 0.9677, AUC : 0.9877
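predict() classifies at the default 0.5 probability cutoff; since pred_proba is already available, other precision/recall tradeoffs can be evaluated by thresholding it directly. A minimal sketch (the 0.3 cutoff is an arbitrary illustration, not a recommendation):

import numpy as np

custom_preds = (pred_proba >= 0.3).astype(int)  # label as positive above a 0.3 cutoff
get_clf_eval(y_test, custom_preds, pred_proba)  # AUC is unchanged: it is threshold-independent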
In [36]:
# Visualize feature importances
from lightgbm import plot_importance
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize = (10, 12))
plot_importance(lgbm_wrapper, ax = ax);
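plot_importance ranks features by split count by default; gain-based importance often orders them differently. A short sketch using the same fitted model (importance_type = 'gain' is a standard plot_importance option):

fig, ax = plt.subplots(figsize = (10, 12))
plot_importance(lgbm_wrapper, ax = ax, importance_type = 'gain');  # rank by total gain instead of split count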