preprocessing code for DeepFM modeling

  • The following code is written in Python:

    • This code preprocesses the data for a DeepFM model that predicts click-through rate (CTR).
    • X (input features of the DeepFM model) = various fields (nominal and continuous variables), each encoded as an n-dimensional vector
    • Y (target of the DeepFM model) = binary 0/1 label (whether the user clicked or not)
    • nominal field => one-hot encoding vector
    • continuous field => use the value as it is, or discretize it (convert it to categorical data) and then one-hot encode it
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Preprocessing setup: column registry and raw data load.

# Registry of column names grouped by feature type ('obj', 'cat', 'cont');
# the lists start empty and are appended to by preprocessing().
cols = {feature_type: [] for feature_type in ('obj', 'cat', 'cont')}

file_name = 'final_data.csv'

# The [:-16] slice trims the tail of the current working directory; keep,
# change, or remove it so that the result matches your own directory layout.
file_path = os.getcwd()[:-16] + 'data\\'

df = pd.read_csv(
    file_path + file_name,
    encoding='utf-8',
)

def preprocessing(df, *, cat_threshold=5):
    """Turn a raw click-log DataFrame into a model-ready feature frame.

    The function performs these steps:

    1. If the column ``category_id_1`` exists, label-encode it into a new
       ``goods_cat`` column. ``goods_cat`` is not classified, scaled or
       one-hot encoded afterwards; it stays as integer codes.
    2. Classify every incoming column by dtype and name, recording the
       result in the module-level ``cols`` registry:

       * object columns whose name contains ``id``, ``time``, ``name``,
         ``keyword`` or ``url`` are dropped; other object columns go to
         ``cols['obj']``;
       * non-object columns whose name contains ``id`` or ``time`` are
         dropped;
       * remaining non-object columns with at most ``cat_threshold``
         distinct non-null values go to ``cols['cat']`` (recorded only, no
         transformation is applied to them here);
       * remaining columns whose name contains ``hour`` or ``group`` are
         kept unchanged and not recorded;
       * everything else goes to ``cols['cont']``.
    3. One-hot encode the ``obj`` columns (first level dropped) and scale
       the ``cont`` columns with ``RobustScaler``.

    The input frame is left untouched; a new DataFrame is returned. The
    ``cols`` registry is cleared at the start of every call so that it
    always describes the most recent call only.

    Args:
        df: Raw input DataFrame. Column names are expected to be strings
            because they are matched by substring.
        cat_threshold: Maximum number of distinct non-null values for a
            non-object column to be registered as categorical. Defaults to
            5, the value that was previously hard-coded.

    Returns:
        A new DataFrame with identifier-like columns removed, object
        columns replaced by dummy columns and continuous columns scaled.
    """
    # Reset the shared registry; otherwise a second call would append the
    # same names again and select duplicated columns further down.
    for names in cols.values():
        names.clear()

    # Snapshot of the incoming columns. goods_cat is added after this
    # snapshot, so it is never classified into any cols list.
    column_dtypes = list(df.dtypes.items())

    # Encode once, up front, while category_id_1 is still available: the
    # column contains 'id' and is therefore dropped during classification.
    # LabelEncoder works on 1-D input, hence the Series (single brackets).
    goods_cat = None
    if 'category_id_1' in df.columns:
        goods_cat = LabelEncoder().fit_transform(df['category_id_1'])

    object_drop_tokens = ('id', 'time', 'name', 'keyword', 'url')
    numeric_drop_tokens = ('id', 'time')
    passthrough_tokens = ('hour', 'group')

    # Classify the columns by data type.
    to_drop = []
    for name, dtype in column_dtypes:
        if dtype == 'object':
            if any(token in name for token in object_drop_tokens):
                to_drop.append(name)
            else:
                cols['obj'].append(name)
        elif any(token in name for token in numeric_drop_tokens):
            to_drop.append(name)
        elif df[name].nunique() <= cat_threshold:
            # Numeric data with only a few distinct values is treated as
            # categorical.
            cols['cat'].append(name)
        elif any(token in name for token in passthrough_tokens):
            # Left in the frame as-is: neither scaled nor encoded.
            pass
        else:
            cols['cont'].append(name)

    # drop() returns a new frame, so the caller's DataFrame is not mutated.
    df = df.drop(columns=to_drop)
    if goods_cat is not None:
        df['goods_cat'] = goods_cat

    # Column preprocessing: one-hot encoding, then scaling.
    obj_cols = cols['obj']
    if obj_cols:
        obj_data = pd.get_dummies(df[obj_cols], drop_first=True)
        df = pd.concat([df, obj_data], axis=1).drop(columns=obj_cols)

    cont_cols = cols['cont']
    if cont_cols:
        # Assign the scaled ndarray directly (positional). Wrapping it in a
        # fresh DataFrame would align on a new RangeIndex and corrupt
        # frames whose index is not 0..n-1.
        df[cont_cols] = RobustScaler().fit_transform(df[cont_cols])

    print('---- Data info ----')
    print('X shape: {}'.format(df.shape))
    print('# of Feature: {}'.format(len(df.columns)))
    return df

# Run the preprocessing pipeline on the loaded data and rebind df to the
# frame it returns.
df = preprocessing(df)
# Preview the first three rows; as a bare expression this is only shown
# when the code runs interactively (e.g. as the last line of a notebook cell).
df.head(3)