import os
from itertools import repeat

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Buckets for column names, grouped by data type.
cols = {'obj': [], 'cat': [], 'cont': []}
def data_split():
    # Reset the global buckets so repeated calls stay idempotent.
    for key in cols:
        cols[key] = []

    file_name = 'final_data_v2.csv'
    # Adjust the path to your own environment (the original [:-16] slice is optional).
    file_path = os.getcwd() + '/drive/MyDrive/Colab Notebooks/'
    df = pd.read_csv(file_path + file_name, encoding='utf-8')

    # Classify columns by data type.
    for dt_idx, dt_val in zip(df.dtypes.index, df.dtypes.values):
        if 'category' in dt_idx and 'category1' not in cols['cat']:
            df['category1'] = LabelEncoder().fit_transform(df['category_id_1'])
            cols['cat'].append('category1')
        if dt_val == 'object':
            if any(key in dt_idx for key in ('id', 'time', 'name', 'keyword', 'url')):
                df.drop(columns=dt_idx, inplace=True)
            else:
                cols['obj'].append(dt_idx)
        else:
            if ('id' in dt_idx) or ('time' in dt_idx):
                df.drop(columns=dt_idx, inplace=True)
            elif df[dt_idx].nunique() <= 30:
                # Numeric columns with at most 30 distinct values are treated as categorical.
                cols['cat'].append(dt_idx)
            elif ('hour' in dt_idx) or ('group' in dt_idx):
                pass
            else:
                cols['cont'].append(dt_idx)
    return cols
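# Standalone illustration of the cardinality heuristic above (a sketch, not part
# of the pipeline): a numeric column with few distinct values is treated as
# categorical, otherwise as continuous. The threshold of 30 mirrors data_split().
_toy = pd.Series([1, 2, 2, 3, 1, 2])
print('categorical' if _toy.nunique() <= 30 else 'continuous')  # -> categorical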
# Using the split column groups, build a new, reordered data frame.
def reorganization(df):
    data = pd.DataFrame()
    cols = data_split()
    for k in cols:
        for col in cols[k]:
            # Columns recorded in cols but missing from df are silently skipped.
            if col in df.columns:
                data[col] = df[col]
    return data
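# Standalone illustration of what reorganization() does, using a tiny
# hypothetical frame: columns are pulled out of df in obj -> cat -> cont order,
# and any name not actually present in df is skipped.
_df = pd.DataFrame({'b': [1], 'a': [2]})
_order = {'obj': ['a'], 'cat': ['b'], 'cont': ['missing']}
_out = pd.DataFrame()
for _k in _order:
    for _c in _order[_k]:
        if _c in _df.columns:
            _out[_c] = _df[_c]
print(list(_out.columns))  # -> ['a', 'b'] ('missing' is skipped)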
# Preprocess the data: one-hot encode object/categorical columns, min-max scale continuous ones.
def preprocessing(df, cols):
    # Classify columns by data type and rebuild the frame in group order.
    data = reorganization(df)
    modified_df = pd.DataFrame()
    vec_dict = {idx: [] for idx in range(len(data.columns))}
    feature_index = []

    for i, c in enumerate(data.columns):
        if c in cols['obj']:
            obj_data = pd.get_dummies(data[c], prefix=c, prefix_sep='/')
            modified_df = pd.concat([modified_df, obj_data], axis=1)
            vec_dict[i] = list(obj_data.columns)
            feature_index.extend(repeat(i, obj_data.shape[1]))
        elif c in cols['cat']:
            # The click_label column is kept aside as the target (y) variable.
            if 'click' in c:
                pass
            else:
                cat_data = pd.get_dummies(data[c], prefix=c, prefix_sep='/')
                vec_dict[i] = list(cat_data.columns)
                feature_index.extend(repeat(i, cat_data.shape[1]))
                modified_df = pd.concat([modified_df, cat_data], axis=1)
        else:
            scaled_num_data = MinMaxScaler().fit_transform(data[[c]])
            scaled_num_data = pd.DataFrame(scaled_num_data, columns=[c], index=data.index)
            modified_df[c] = scaled_num_data[c]
            vec_dict[i] = list(scaled_num_data.columns)
            feature_index.extend(repeat(i, scaled_num_data.shape[1]))

    print('---- Data info ----')
    print(cols)
    print('Data Frame shape: {}'.format(data.shape))
    print('# of Feature: {}'.format(len(feature_index)))
    print(f'# of Field: {len(vec_dict)}')
    print(f'Modified DF columns: {modified_df.columns}')
    return vec_dict, feature_index, modified_df
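# Minimal end-to-end usage sketch: assumes final_data_v2.csv is available at the
# Colab path used in data_split(). vec_dict maps each field index to its
# expanded feature columns, and feature_index maps each feature back to its
# field, which is the layout typically fed to field-aware models.
if __name__ == '__main__':
    field_cols = data_split()
    raw_df = pd.read_csv(os.getcwd() + '/drive/MyDrive/Colab Notebooks/final_data_v2.csv',
                         encoding='utf-8')
    vec_dict, feature_index, modified_df = preprocessing(raw_df, field_cols)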