1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
# import modules & files
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
# Show up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)

# Load the training data and take a first look at its shape and dtypes.
movie_train = pd.read_csv('./movies_train.csv', encoding='utf8')
movie_train.head(2)
movie_train.info()

# Correlation overview.
# The matrix is computed once, on numeric/bool columns only: pandas >= 2.0
# raises when DataFrame.corr() meets a non-numeric column (older versions
# silently skipped them), and the original recomputed the matrix three times.
corr_matrix = movie_train.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(data=corr_matrix, annot=True)

# Keep only coefficients of at least 0.3; every other cell becomes NaN.
corr_matrix[corr_matrix >= 0.3]
########## Figuring out missing value
# Rows whose dir_prev_bfnum is missing, and how many of them there are.
prev_bfnum_missing = movie_train['dir_prev_bfnum'].isnull()
movie_train[prev_bfnum_missing]
prev_bfnum_missing.sum()

# Per-column null counts, restricted to columns where at most 70% of the
# rows are null.
null_counts = movie_train.isnull().sum()
null_counts[null_counts <= len(movie_train) * 0.7]

# Number of movies per director.
movie_train['director'].value_counts()
# Director names split by whether dir_prev_bfnum is recorded for the movie.
# Each list holds one entry per movie row (duplicates kept, frame order).
# The null mask is built once; the original re-filtered the whole frame for
# every single element, which is quadratic in the number of rows.
prev_bfnum_is_null = movie_train['dir_prev_bfnum'].isnull()

# Director of every movie.
idx_dir_bfn = movie_train['director'].tolist()
# Directors of the movies whose dir_prev_bfnum is missing.
null_idx_dir_bfn = movie_train.loc[prev_bfnum_is_null, 'director'].tolist()
# Directors of the movies whose dir_prev_bfnum is present.
nn_idx_dir_bfn = movie_train.loc[~prev_bfnum_is_null, 'director'].tolist()

# Sanity checks: rows with a missing value, distinct directors overall, and
# distinct directors among rows that do have a value.
len(movie_train['director'][prev_bfnum_is_null])
len(set(idx_dir_bfn))
len(movie_train['director'][~prev_bfnum_is_null].unique())
# Directors that appear both on a row with a missing dir_prev_bfnum and on a
# row with a recorded one.
# The original nested loop tested `i in j` on two strings, which is a
# substring check: a name merely contained in another director's name was
# selected too. Set intersection matches whole names only and avoids the
# O(len(null) * len(non-null)) scan.
dir_nm = set(null_idx_dir_bfn) & set(nn_idx_dir_bfn)
# Build a per-director table of dir_prev_bfnum values and derive, for each
# director in dir_nm, the mean of the non-zero values found in the first four
# value columns. The result is exposed as dir_bfnum_dict (name -> mean).

# Freeze one iteration order so table rows and names line up. Assigning the
# set itself to a column raises TypeError in current pandas ("unordered").
dir_names = list(dir_nm)

# One row per director; rows are padded with NaN to the longest director.
bf_num = [
    movie_train['dir_prev_bfnum'][movie_train['director'] == name].values
    for name in dir_names
]
df = pd.DataFrame(bf_num)

# Columns 4-6 are discarded as before; errors='ignore' keeps this from
# raising KeyError when no director has that many movies.
df.drop(columns=[4, 5, 6], errors='ignore', inplace=True)

# Remember the (up to) four leading value columns before the name column is
# appended, so a narrow table can never pull 'dir_nm' into the arithmetic.
value_cols = list(df.columns[:4])
df['dir_nm'] = dir_names
df

# Missing values are encoded as 0 from here on, so a genuine 0 is treated
# the same as "no value" (unchanged from the original logic).
df.fillna(0, inplace=True)
first_values = df[value_cols]
zero_cnt = (first_values == 0).sum(axis=1).tolist()

# Mean of the non-zero entries per row. The original only handled rows with
# exactly 1, 2 or 3 zeros; rows with 0 or 4 zeros appended nothing, which
# made the column assignment below fail on a length mismatch. A row without
# any non-zero value now yields NaN.
mean_of_nonzero = first_values.where(first_values != 0).mean(axis=1)
df_mean = mean_of_nonzero.tolist()
df['mean_bfnum'] = df_mean

# Lookup table: director name -> mean of that director's known values.
dir_bfnum_dict = dict(zip(df['dir_nm'], df['mean_bfnum']))
# Outer-join the per-director means onto the movie table (kept in `mt`).
mt = pd.merge(
    movie_train,
    df[['dir_nm', 'mean_bfnum']],
    how='outer',
    left_on='director',
    right_on='dir_nm',
)

# Fill each missing dir_prev_bfnum with the mean stored for that movie's
# director in dir_bfnum_dict; directors without an entry stay NaN here.
# This is a single .loc assignment because the original chained form
# `movie_train[col][mask] = ...` writes to a temporary object: it emits
# SettingWithCopyWarning and leaves movie_train unchanged under Copy-on-Write.
missing_prev_bfnum = movie_train['dir_prev_bfnum'].isnull()
director_mean = movie_train['director'].map(dir_bfnum_dict)
movie_train.loc[missing_prev_bfnum, 'dir_prev_bfnum'] = director_mean[missing_prev_bfnum]

movie_train.info()

# Whatever is still missing, in any column, is replaced with 0.
movie_train.fillna(0, inplace=True)
|