Bigdata Machine Learning example

Machine Learning Example

한국데이터산업진흥원(KDATA, dataq.or.kr)에서 제시한 예시 문제(분류 문제)의 답안을 만들어 보았다. 머신러닝을 연습하면서 파이썬 코드로 작성했다.

xgboost와 lightgbm은 시험 환경에서 동작하지 않아 사용하지 않았다.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd

# Print every column when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Load the training features, the training labels and the test features.
X_train = pd.read_csv("data/X_train.csv")
y_train = pd.read_csv("data/y_train.csv")
X_test = pd.read_csv("data/X_test.csv")

# User code
## Missing values
# Recorded during exploration: 1611 missing values in X_test, 2295 in
# X_train, none in y_train. An earlier fillna(0) variant was commented out
# in favor of the imputer below.
from sklearn.impute import KNNImputer

# Fit the imputer on the training column only and reuse it for the test
# column, so both sets are filled from the same statistics and nothing is
# learned from the test data.
refund_imputer = KNNImputer(n_neighbors=5).fit(X_train[['환불금액']])
X_train['환불금액'] = refund_imputer.transform(X_train[['환불금액']]).ravel()
X_test['환불금액'] = refund_imputer.transform(X_test[['환불금액']]).ravel()

# Optional sanity checks after imputation:
# print(X_train.isnull().sum())
# print(X_test.isnull().sum())

## Inspect dtypes
# print(X_train.info())
# print(X_test.info())
# print(y_train.info())

## EDA -- encode the object-typed columns
# '주구매상품' and '주구매지점' are label-encoded.
from sklearn.preprocessing import LabelEncoder

# Use ONE encoder per column, fitted on the categories found in train and
# test together. Fitting separate encoders on each set can assign different
# integers to the same category, and fitting on train alone would raise on
# a category that only appears in test.
for column in ['주구매상품', '주구매지점']:
    encoder = LabelEncoder().fit(pd.concat([X_train[column], X_test[column]]))
    X_train[column] = encoder.transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

## Min-max scale the numeric columns:
## 총구매액, 최대구매액, 환불금액, 내점일수, 구매주기
from sklearn.preprocessing import MinMaxScaler

scale_columns = ['총구매액', '최대구매액', '환불금액', '내점일수', '구매주기']

# Learn the min/max from the training data only and apply the same mapping
# to the test data, so a given raw value is scaled identically in both sets.
# Test values outside the training range may fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(X_train[scale_columns])
X_train[scale_columns] = scaler.transform(X_train[scale_columns])
X_test[scale_columns] = scaler.transform(X_test[scale_columns])

# Optional sanity checks after scaling:
# print(X_train.describe())
# print(X_test.describe())

## Split the labelled data into a training part and a validation part
# Note kept from exploration: print(2482/3500) -> 70.9%
from sklearn.model_selection import train_test_split

# Keep only the target column of the label frame.
y_train = y_train['gender']

# 70/30 shuffled split with a fixed seed for reproducibility.
holdout = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    shuffle=True,
    random_state=2021,
)
X_train, X_val, y_train, y_val = holdout

print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

## Classification models (ensembles: AdaBoost, gradient boosting, random forest)
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)

# Each model uses the same fixed seed and is trained on the training split.
abc = AdaBoostClassifier(random_state=2021)
abc.fit(X_train, y_train)

gbc = GradientBoostingClassifier(random_state=2021)
gbc.fit(X_train, y_train)

rfc = RandomForestClassifier(random_state=2021)
rfc.fit(X_train, y_train)

## Validation accuracy (optional checks)
# print(abc.score(X_val, y_val))
# print(gbc.score(X_val, y_val))
# print(rfc.score(X_val, y_val))

## Class predictions on the validation split, one per model.
# Each prediction must come from its own model; previously all three were
# produced by the AdaBoost model, so any per-model report built from
# pred_gbc / pred_rfc would have silently repeated the AdaBoost results.
pred_abc = abc.predict(X_val)
pred_gbc = gbc.predict(X_val)
pred_rfc = rfc.predict(X_val)

from sklearn.metrics import classification_report, roc_auc_score

# Optional per-model reports:
# print(classification_report(y_val, pred_abc, digits=2))
# print(classification_report(y_val, pred_gbc, digits=2))
# print(classification_report(y_val, pred_rfc, digits=2))

# Column 1 of predict_proba (the probability of each model's second class)
# on the validation split.
prob_abc, prob_gbc, prob_rfc = (
    model.predict_proba(X_val)[:, 1] for model in (abc, gbc, rfc)
)

# Validation ROC AUC recorded by the author:
#   AdaBoost          0.649317286036036
#   GradientBoosting  0.6513388388388388
#   RandomForest      0.6418703860110111
# Only the gradient-boosting score is printed.
gbc_auc = roc_auc_score(y_val, prob_gbc)
print(gbc_auc)

##### The GradientBoosting model produces the final answer
pred = gbc.predict_proba(X_test)[:, 1]

# Submission reference:
# change the prediction variable and the examinee number (file name) below
# to your own before use.
submission = pd.DataFrame({'cust_id': X_test.cust_id, 'gender': pred})
submission.to_csv('003000000.csv', index=False, encoding='utf-8')