gunnwu 2023. 5. 29. 23:04

A copy sheet for ML tasks

# Classification problem
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format='retina'

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import *

# Regression problem
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format='retina'

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import *

# Check basic data info
df = pd.read_csv('https://raw.githubusercontent.com/jangrae/csv/master/Attrition_simple2.csv', encoding='cp949')
df.head(2)
df.info()
df.isna().sum()
df.shape
df.columns
df.tail(2)

# Data preparation
df = df.drop('EmployeeNumber', axis=1)
target = 'Attrition'
x = df.drop(target, axis=1)
y = df.loc[:, target]
x = pd.get_dummies(x, columns=['Gender', 'MaritalStatus', 'OverTime'], drop_first=True)
x.tail(2)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

scaler = MinMaxScaler()
scaler.fit(x_train)

x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
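
The same leak-free scaling can also be folded into a pipeline, so cross-validation refits the scaler on each fold. A minimal sketch using the make_pipeline import above:

# Sketch: chain scaler + model so each CV fold fits its own scaler
knn_pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
print(cross_val_score(knn_pipe, x_train, y_train, cv=5).mean())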

# Performance estimation with cross_val_score
# Logistic Regression
model = LogisticRegression()
cv_score = cross_val_score(model, x_train, y_train, cv=5)
print(cv_score.mean())
# Collect results
result = {}
result['Logistic'] = cv_score.mean()

# KNN (uses the scaled features)
model = KNeighborsClassifier()
cv_score = cross_val_score(model, x_train_s, y_train, cv=5)
print(cv_score.mean())
result['KNN'] = cv_score.mean()


# Random Forest
model = RandomForestClassifier()
cv_score = cross_val_score(model, x_train, y_train, cv=5)
print(cv_score.mean())
result['RF'] = cv_score.mean()

# SVM (uses the scaled features)
model = SVC()
cv_score = cross_val_score(model, x_train_s, y_train, cv=5)
print(cv_score.mean())
result['SVM'] = cv_score.mean()

# XGBoost
model = XGBClassifier(random_state=1)
cv_score = cross_val_score(model, x_train, y_train, cv=5)
print(cv_score.mean())
result['XGB'] = cv_score.mean()

# LightGBM
model = LGBMClassifier(random_state=1)
cv_score = cross_val_score(model, x_train, y_train, cv=5)
print(cv_score.mean())
result['LGBM'] = cv_score.mean()
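
The six blocks above repeat one pattern, so they collapse naturally into a loop. A sketch reusing the estimators and scaled/unscaled splits from above (it rebuilds `result` from scratch):

# Sketch: one loop over (name, model, training matrix) instead of six blocks
models = {'Logistic': (LogisticRegression(), x_train),
          'KNN': (KNeighborsClassifier(), x_train_s),
          'RF': (RandomForestClassifier(), x_train),
          'SVM': (SVC(), x_train_s),
          'XGB': (XGBClassifier(random_state=1), x_train),
          'LGBM': (LGBMClassifier(random_state=1), x_train)}
result = {}
for name, (m, x_tr) in models.items():
    result[name] = cross_val_score(m, x_tr, y_train, cv=5).mean()
    print(name, result[name].round(3))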



# Compare performance
print('=' * 40)
for m_name, score in result.items():
    print(m_name, score.round(3))
print('=' * 40)

# Visualize the comparison (use a new name so the `result` dict stays intact)
result_df = pd.DataFrame(result.items(), columns=['key', 'value'])
result_df = result_df.sort_values('value')
ax = sns.barplot(x='key', y='value', data=result_df, order=result_df['key'], palette='pastel')
for i, v in enumerate(result_df['value']):
    ax.text(i, v, f'{v:.3f}', ha='center', va='bottom')
plt.show()



# Optimization (hyperparameter tuning, feature importance selection)
model = XGBClassifier(random_state=1)
# Parameter grid
param = {'learning_rate': [0.01, 0.1, 0.3], 'max_depth': range(1, 21)}
# Wrap the model in a 5-fold grid search
model = GridSearchCV(model, param, cv=5, scoring='accuracy', verbose=2)
model.fit(x_train, y_train)
print(model.best_params_)
# print(model.best_estimator_)
print(model.best_score_)
model.best_estimator_.feature_importances_
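
GridSearchCV also keeps the full grid in cv_results_, which is useful for seeing how close the runner-up settings were. A minimal sketch, assuming the fitted `model` from above:

# Sketch: inspect the whole grid, not just the winner
cv_res = pd.DataFrame(model.cv_results_)
print(cv_res[['param_learning_rate', 'param_max_depth', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False).head())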

# Plot the importances against the column names
plt.figure(figsize=(5, 5))
plt.barh(y=list(x), width=model.best_estimator_.feature_importances_)
plt.show()


# Performance evaluation with the tuned parameters
model = XGBClassifier(random_state=1, max_depth=1, learning_rate=0.3)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))
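
Before resampling, it is worth checking whether simply lowering the decision threshold recovers recall. A sketch; the 0.3 cutoff is an arbitrary assumption, not a tuned value:

# Sketch: trade precision for recall by lowering the threshold (0.3 is assumed)
y_proba = model.predict_proba(x_test)[:, 1]
y_pred_low = (y_proba >= 0.3).astype(int)
print(classification_report(y_test, y_pred_low))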

# Handling class imbalance (when recall is noticeably low)

# Oversampling
from imblearn.over_sampling import RandomOverSampler

over_sample = RandomOverSampler()
o_x_train, o_y_train = over_sample.fit_resample(x_train, y_train)

# Check the class counts
print('Before:', np.bincount(y_train))
print('After:', np.bincount(o_y_train))

model = XGBClassifier(max_depth=1, random_state=1, learning_rate=0.3)
model.fit(o_x_train, o_y_train)
y_pred = model.predict(x_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


# Undersampling
from imblearn.under_sampling import RandomUnderSampler

under_sample = RandomUnderSampler()
u_x_train, u_y_train = under_sample.fit_resample(x_train, y_train)

# Check the class counts
print('Before:', np.bincount(y_train))
print('After:', np.bincount(u_y_train))

model = XGBClassifier(max_depth=5, random_state=1)
model.fit(u_x_train, u_y_train)
y_pred = model.predict(x_test)
# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
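
SMOTE is a third option: it synthesizes new minority-class points instead of duplicating or discarding rows. A sketch, assuming the same imbalanced-learn package used above:

# Sketch: SMOTE oversampling (synthetic minority samples)
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=1)
s_x_train, s_y_train = smote.fit_resample(x_train, y_train)
print('Before:', np.bincount(y_train))
print('After:', np.bincount(s_y_train))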

## Extra: boosting performance by selecting important features
importance_sort = pd.DataFrame()
importance_sort['feature_name'] = x.columns
importance_sort['feature_importance'] = model.feature_importances_
importance_sort = importance_sort.sort_values(by='feature_importance', ascending=False)
importance_sort.reset_index(drop=True, inplace=True)
importance_sort

# Visualization
ax = sns.barplot(x='feature_name', y='feature_importance', data=importance_sort, order=importance_sort['feature_name'], palette='pastel')
for i, v in enumerate(importance_sort['feature_importance']):
    ax.text(i, v, f'{v:.3f}', ha='center', va='bottom')
plt.xticks(rotation=90)
plt.show()


# `acc` is used below but never defined in the original sheet; this loop is an
# assumed reconstruction: train on the top-k features and record test accuracy
acc = pd.DataFrame(columns=['accuracy_score'])
for i in range(len(importance_sort)):
    top_features = importance_sort['feature_name'][:i + 1]
    m = XGBClassifier(max_depth=1, learning_rate=0.3, random_state=1)
    m.fit(o_x_train[top_features], o_y_train)
    acc.loc[i] = accuracy_score(y_test, m.predict(x_test[top_features]))

acc.plot(figsize=(20, 5), marker='o')
plt.xlabel('number of top features')
plt.ylabel('accuracy')
plt.grid()
plt.show()

acc = acc.sort_values(by='accuracy_score', ascending=False)
acc
importance_top = importance_sort['feature_name'][:acc.index[0] + 1]
x_train_top = o_x_train[importance_top]
x_test_top = x_test[importance_top]
xgb_top_model = XGBClassifier(max_depth=1, learning_rate=0.3, random_state=1)
xgb_top_model.fit(x_train_top, o_y_train)
xgb_top_pred = xgb_top_model.predict(x_test_top)
print(accuracy_score(y_test,xgb_top_pred))
print(confusion_matrix(y_test,xgb_top_pred))
print(classification_report(y_test,xgb_top_pred))


# Ensemble
# Stacking
# Declare the base estimators; KNN needs scaling, so it is chained with a scaler
# in a pipeline (the scaler fitted earlier does not apply inside the ensemble)
estimators = [('dt', DecisionTreeClassifier()),
              ('knn', make_pipeline(MinMaxScaler(), KNeighborsClassifier())),
              ('lr', LogisticRegression(max_iter=1000)),
              ('lgb', LGBMClassifier())]

model = StackingClassifier(estimators=estimators,
                           final_estimator=RandomForestClassifier())

# Fit
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Collect the score (assumes `result` is still the dict built during cross-validation)
result['Stacking'] = accuracy_score(y_test, y_pred)


# Voting
# Declare the base estimators (regression sheet)
estimators = [('lr', LinearRegression()),
              ('dt', DecisionTreeRegressor()),
              ('knn', make_pipeline(MinMaxScaler(), KNeighborsRegressor())),
              ('rdf', RandomForestRegressor()),
              ('lgb', LGBMRegressor())]

model = VotingRegressor(estimators=estimators)

# Fit
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)

# Evaluate
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

# Collect the score
result['Voting'] = r2_score(y_test, y_pred)
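
VotingRegressor also accepts per-model weights when some members deserve more influence on the average. A sketch; the weights shown are arbitrary assumptions, not tuned values:

# Sketch: weighted average of the same estimators (weights are assumed)
model_w = VotingRegressor(estimators=estimators, weights=[1, 1, 1, 2, 2])
model_w.fit(x_train, y_train)
print(r2_score(y_test, model_w.predict(x_test)))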