Preprocessing code

Splitting the columns in the data

import os
from itertools import repeat

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

cols = {'obj': [],
        'cat': [],
        'cont': []
        }

def data_split():
    file_name = 'final_data_v2.csv'
    file_path = os.getcwd() + '/drive/MyDrive/Colab Notebooks/'  # adjust the path to your own environment
    df = pd.read_csv(file_path + file_name, encoding='utf-8')

    # Reset the module-level dictionary so repeated calls don't accumulate duplicates
    for k in cols:
        cols[k] = []

    # Classify the columns by data type
    for dt_idx, dt_val in zip(df.dtypes.index, df.dtypes.values):
        if 'category' in dt_idx:
            # LabelEncoder expects 1-D input, so index with a single column name
            df['category1'] = LabelEncoder().fit_transform(df['category_id_1'])
            if 'category1' not in cols['cat']:
                cols['cat'].append('category1')

        if dt_val == 'object':
            if ('id' in dt_idx) | ('time' in dt_idx) | ('name' in dt_idx) | ('keyword' in dt_idx) | ('url' in dt_idx):
                df.drop(columns=dt_idx, inplace=True)
            else:
                cols['obj'].append(dt_idx)
        else:
            if ('id' in dt_idx) | ('time' in dt_idx):
                df.drop(columns=dt_idx, inplace=True)
            else:
                # numeric columns with at most 30 distinct values are treated as categorical
                if df[dt_idx].nunique() <= 30:
                    cols['cat'].append(dt_idx)
                elif ('hour' in dt_idx) | ('group' in dt_idx):
                    pass
                else:
                    cols['cont'].append(dt_idx)

    return cols
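
For reference, a minimal sketch of what the returned dictionary can look like. The column names below are hypothetical, since the actual contents depend on final_data_v2.csv:

cols = data_split()
print(cols)
# e.g. {'obj': ['gender'], 'cat': ['category1', 'age_group'], 'cont': ['price', 'view_count']}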

# Using the split columns, build a new data frame
def reorganization(df):
    data = pd.DataFrame()
    cols = data_split()
    for k in cols:
        for c in cols[k]:
            # keep only the columns that actually exist in the given frame
            if c in df.columns:
                data[c] = df[c]

    return data
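
A quick sketch of calling reorganization on its own, assuming df is the raw frame loaded from the same path used inside data_split:

df = pd.read_csv(os.getcwd() + '/drive/MyDrive/Colab Notebooks/final_data_v2.csv', encoding='utf-8')
data = reorganization(df)
print(data.columns)  # only the columns kept in cols, ordered obj -> cat -> cont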

# Preprocess the data
def preprocessing(data, cols):
    # Rebuild the frame so the columns follow the type classification
    data = reorganization(data)
    modified_df = pd.DataFrame()
    vec_dict = {idx: [] for idx in range(len(data.columns))}
    feature_index = []

    for i, c in enumerate(data.columns):
        if c in cols['obj']:
            # one-hot encode object (string) columns
            obj_data = pd.get_dummies(data[c], prefix=c, prefix_sep="/")
            modified_df = pd.concat([modified_df, obj_data], axis=1)
            vec_dict[i] = list(obj_data.columns)
            feature_index.extend(repeat(i, obj_data.shape[1]))

        elif c in cols['cat']:
            if 'click' in c:  # the click_label column is used as the target (y) variable
                pass
            else:
                cat_data = pd.get_dummies(data[c], prefix=c, prefix_sep="/")
                vec_dict[i] = list(cat_data.columns)
                feature_index.extend(repeat(i, cat_data.shape[1]))
                modified_df = pd.concat([modified_df, cat_data], axis=1)
        else:
            # min-max scale the continuous columns
            scaled_num_data = MinMaxScaler().fit_transform(data[[c]])
            scaled_num_data = pd.DataFrame(scaled_num_data, columns=[c])
            modified_df[c] = scaled_num_data[c]
            vec_dict[i] = list(scaled_num_data.columns)
            feature_index.extend(repeat(i, scaled_num_data.shape[1]))

    print('---- Data info ----')
    print(cols)
    print(f'Data Frame shape: {data.shape}')
    print(f'# of Features: {len(feature_index)}')
    print(f'# of Fields: {len(vec_dict)}')
    print(f'Modified DF columns: {modified_df.columns}')
    return vec_dict, feature_index, modified_df
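
Putting it together, a minimal end-to-end sketch using only the functions defined above (the path is the one hard-coded in data_split):

df = pd.read_csv(os.getcwd() + '/drive/MyDrive/Colab Notebooks/final_data_v2.csv', encoding='utf-8')
cols = data_split()
vec_dict, feature_index, modified_df = preprocessing(df, cols)

# vec_dict maps each field index to its one-hot / scaled column names,
# feature_index maps each expanded feature back to its field index,
# and modified_df is the encoded and scaled frame used downstream.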