1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
import pandas as pd

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Load the test features, the training features and the training labels
# from the local data/ directory, in that order.
X_test, X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/X_test.csv",
        "data/X_train.csv",
        "data/y_train.csv",
    )
)
# User code
## Check for missing values
# print(X_test.isnull().sum()/len(X_test))  # 1611 missing values present
# print(X_train.isnull().sum())  # 2295 missing values present
# print(y_train.isnull().sum())  # 0 missing values present
## Handle missing values
# print(X_test.describe()['환불금액'])
# print(X_train.describe()['환불금액'])
# print(X_test.corr()['환불금액'])
# print(X_train.corr()['환불금액'])
from sklearn.impute import KNNImputer

# Alternative (disabled): fill every missing value with 0.
# X_test.fillna(0, inplace=True)
# X_train.fillna(0, inplace=True)
# print(X_test.describe()['환불금액'])
# print(X_train.describe()['환불금액'])

# Impute the missing refund amounts ('환불금액').
# The imputer is fitted on the training features only and then reused on the
# test features, so both sets are filled from the same reference data and no
# test-set statistics leak into preprocessing.
# NOTE(review): only this single column is passed to the imputer, so there are
# no other features to measure neighbour distance on; this presumably
# degenerates to filling with the training mean -- confirm against the
# KNNImputer documentation.
refund_column = ['환불금액']
refund_imputer = KNNImputer(n_neighbors=5)
X_train[refund_column] = refund_imputer.fit_transform(X_train[refund_column])
X_test[refund_column] = refund_imputer.transform(X_test[refund_column])
# print(X_test.isnull().sum()) # print(X_train.isnull().sum()) # print(X_test.describe()['환불금액']) # print(X_train.describe()['환불금액'])
## Check data types
# print(X_test.info())
# print(X_train.info())
# print(y_train.info())
## EDA -- converting the object-typed columns
# print(X_train['주구매상품'].value_counts())  ## label encoding
# print(X_train['주구매지점'].value_counts())  ## label encoding
from sklearn.preprocessing import LabelEncoder

# Label-encode the two categorical columns.
# One encoder per column is shared between train and test so that a given
# category always maps to the same integer code in both sets. Fitting separate
# encoders (as before) can assign different codes to the same category whenever
# the two sets do not contain exactly the same labels.
# The encoder is fitted on the union of train and test labels so that
# transform() cannot fail on a label that appears in only one of the sets.
for categorical_column in ('주구매상품', '주구매지점'):
    shared_encoder = LabelEncoder()
    shared_encoder.fit(
        pd.concat([X_train[categorical_column], X_test[categorical_column]])
    )
    X_train[categorical_column] = shared_encoder.transform(
        X_train[categorical_column]
    )
    X_test[categorical_column] = shared_encoder.transform(
        X_test[categorical_column]
    )
# print(X_test.describe()) # print(X_train.describe())
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 총구매액, 최대구매액, 환불금액, 내점일수 and 구매주기.
# The scaler learns min/max from the training features only and the same
# mapping is applied to the test features; fitting a second scaler on the test
# set would put train and test values on different scales.
# Test values outside the training range will fall outside [0, 1].
scaled_columns = ['총구매액', '최대구매액', '환불금액', '내점일수', '구매주기']
feature_scaler = MinMaxScaler()
X_train[scaled_columns] = feature_scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = feature_scaler.transform(X_test[scaled_columns])
# print(X_test.describe()) # print(X_train.describe())
from sklearn.model_selection import train_test_split

# Split the training set into training and validation subsets.
# print(2482/3500)  # 70.9%

# Keep only the 'gender' column of the label frame as the target.
y_train = y_train['gender']

# Hold out 30% of the rows for validation; shuffling is seeded for
# reproducibility.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    shuffle=True,
    random_state=2021,
)
X_train, X_val, y_train, y_val = split_parts

# Print the shapes of the four resulting pieces.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))
# Fit classification models (ensembles: AdaBoost, gradient boosting,
# random forest), all with the same random seed.
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)

abc = AdaBoostClassifier(random_state=2021)
abc.fit(X_train, y_train)

gbc = GradientBoostingClassifier(random_state=2021)
gbc.fit(X_train, y_train)

rfc = RandomForestClassifier(random_state=2021)
rfc.fit(X_train, y_train)
## Check performance on the validation data
# print(abc.score(X_val, y_val))
# print(gbc.score(X_val, y_val))
# print(rfc.score(X_val, y_val))
# Class predictions on the validation split, one per fitted model.
# Each prediction comes from its own model; previously all three were taken
# from the AdaBoost model, so any per-model report built from them would have
# been identical.
pred_abc = abc.predict(X_val)
pred_gbc = gbc.predict(X_val)
pred_rfc = rfc.predict(X_val)
from sklearn.metrics import classification_report, roc_auc_score # help(classification_report) # print(classification_report(y_val, pred_abc, digits=2)) # print(classification_report(y_val, pred_gbc, digits=2)) # print(classification_report(y_val, pred_rfc, digits=2))
# Second column of predict_proba for each model on the validation split.
prob_abc, prob_gbc, prob_rfc = (
    fitted_model.predict_proba(X_val)[:, 1]
    for fitted_model in (abc, gbc, rfc)
)

# Validation ROC AUC; the trailing numbers are the scores noted by the author.
# print(roc_auc_score(y_val, prob_abc))  # 0.649317286036036
print(roc_auc_score(y_val, prob_gbc))  # 0.6513388388388388
# print(roc_auc_score(y_val, prob_rfc))  # 0.6418703860110111
# Final answer: use the GradientBoosting model's second predict_proba column
# on the test features.
test_probabilities = gbc.predict_proba(X_test)
pred = test_probabilities[:, 1]

# Submission note: change the prediction variable and the examinee number
# below to your own before use.
submission = pd.DataFrame({'cust_id': X_test['cust_id'], 'gender': pred})
submission.to_csv('003000000.csv', index=False, encoding='utf-8')
|