Random Forest¶
ⅰ. 모듈 & DATA 특성 확인¶
In [1]:
import numpy as pn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
member = pd.read_csv("Data/member.csv")
trans = pd.read_csv("Data/transaction.csv")
member.head()
Out[2]:
id | recency | zip_code | is_referral | channel | conversion | |
---|---|---|---|---|---|---|
0 | 906145 | 10 | Surburban | 0 | Phone | 0 |
1 | 184478 | 6 | Rural | 1 | Web | 0 |
2 | 394235 | 7 | Surburban | 1 | Web | 0 |
3 | 130152 | 9 | Rural | 1 | Web | 0 |
4 | 940352 | 2 | Urban | 0 | Web | 0 |
In [3]:
member.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 64000 entries, 0 to 63999 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 64000 non-null int64 1 recency 64000 non-null int64 2 zip_code 64000 non-null object 3 is_referral 64000 non-null int64 4 channel 64000 non-null object 5 conversion 64000 non-null int64 dtypes: int64(4), object(2) memory usage: 2.9+ MB
In [4]:
member.describe()
Out[4]:
id | recency | is_referral | conversion | |
---|---|---|---|---|
count | 64000.000000 | 64000.000000 | 64000.000000 | 64000.000000 |
mean | 550694.137797 | 5.763734 | 0.502250 | 0.146781 |
std | 259105.689773 | 3.507592 | 0.499999 | 0.353890 |
min | 100001.000000 | 1.000000 | 0.000000 | 0.000000 |
25% | 326772.000000 | 2.000000 | 0.000000 | 0.000000 |
50% | 551300.000000 | 6.000000 | 1.000000 | 0.000000 |
75% | 774914.500000 | 9.000000 | 1.000000 | 0.000000 |
max | 999997.000000 | 12.000000 | 1.000000 | 1.000000 |
In [5]:
trans.head()
Out[5]:
id | num_item | total_amount | |
---|---|---|---|
0 | 906145 | 5 | 34000 |
1 | 906145 | 1 | 27000 |
2 | 906145 | 4 | 33000 |
3 | 184478 | 4 | 29000 |
4 | 394235 | 4 | 33000 |
In [6]:
trans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 196836 entries, 0 to 196835 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 196836 non-null int64 1 num_item 196836 non-null int64 2 total_amount 196836 non-null int64 dtypes: int64(3) memory usage: 4.5 MB
In [7]:
trans.describe()
Out[7]:
id | num_item | total_amount | |
---|---|---|---|
count | 196836.000000 | 196836.000000 | 196836.000000 |
mean | 550557.552932 | 3.078365 | 21837.102969 |
std | 259254.795613 | 1.478408 | 8218.005565 |
min | 100001.000000 | 1.000000 | 8000.000000 |
25% | 326719.000000 | 2.000000 | 15000.000000 |
50% | 550918.000000 | 3.000000 | 22000.000000 |
75% | 774916.000000 | 4.000000 | 29000.000000 |
max | 999997.000000 | 6.000000 | 38000.000000 |
ⅱ. Data Merging¶
- feature engineering
In [8]:
trans['avg_price'] = trans['total_amount'] / trans['num_item']
trans.head()
Out[8]:
id | num_item | total_amount | avg_price | |
---|---|---|---|---|
0 | 906145 | 5 | 34000 | 6800.0 |
1 | 906145 | 1 | 27000 | 27000.0 |
2 | 906145 | 4 | 33000 | 8250.0 |
3 | 184478 | 4 | 29000 | 7250.0 |
4 | 394235 | 4 | 33000 | 8250.0 |
In [9]:
trans_mean = trans.groupby('id').mean()
trans_cnt = trans['id'].value_counts()
trans_df = pd.concat([trans_mean, trans_cnt], axis=1)
trans_df
Out[9]:
num_item | total_amount | avg_price | id | |
---|---|---|---|---|
100001 | 3.500000 | 26000.000000 | 7500.000000 | 2 |
100008 | 5.000000 | 26000.000000 | 5200.000000 | 1 |
100032 | 2.666667 | 20666.666667 | 9366.666667 | 3 |
100036 | 3.000000 | 25800.000000 | 13273.333333 | 5 |
100070 | 3.250000 | 21250.000000 | 8537.500000 | 4 |
... | ... | ... | ... | ... |
999932 | 5.000000 | 32000.000000 | 6400.000000 | 1 |
999981 | 2.000000 | 22750.000000 | 12875.000000 | 4 |
999990 | 3.000000 | 28000.000000 | 10388.888889 | 3 |
999995 | 2.000000 | 27000.000000 | 13500.000000 | 1 |
999997 | 2.000000 | 13000.000000 | 6500.000000 | 1 |
64000 rows × 4 columns
In [10]:
trans_df.columns = ['num_item', 'total_amount', 'avg_price', 'count']
# trans_df.rename(columns={'id' : 'count'})
trans_df.head()
Out[10]:
num_item | total_amount | avg_price | count | |
---|---|---|---|---|
100001 | 3.500000 | 26000.000000 | 7500.000000 | 2 |
100008 | 5.000000 | 26000.000000 | 5200.000000 | 1 |
100032 | 2.666667 | 20666.666667 | 9366.666667 | 3 |
100036 | 3.000000 | 25800.000000 | 13273.333333 | 5 |
100070 | 3.250000 | 21250.000000 | 8537.500000 | 4 |
In [11]:
df = member.join(trans_df, on='id')
In [12]:
df
Out[12]:
id | recency | zip_code | is_referral | channel | conversion | num_item | total_amount | avg_price | count | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 906145 | 10 | Surburban | 0 | Phone | 0 | 3.333333 | 31333.333333 | 14016.666667 | 3 |
1 | 184478 | 6 | Rural | 1 | Web | 0 | 4.000000 | 29000.000000 | 7250.000000 | 1 |
2 | 394235 | 7 | Surburban | 1 | Web | 0 | 4.000000 | 20500.000000 | 5125.000000 | 2 |
3 | 130152 | 9 | Rural | 1 | Web | 0 | 1.750000 | 20750.000000 | 14875.000000 | 4 |
4 | 940352 | 2 | Urban | 0 | Web | 0 | 3.000000 | 31000.000000 | 10333.333333 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
63995 | 838295 | 10 | Urban | 0 | Web | 0 | 3.500000 | 26000.000000 | 8012.500000 | 4 |
63996 | 547316 | 5 | Urban | 1 | Phone | 0 | 1.800000 | 17800.000000 | 11300.000000 | 5 |
63997 | 131575 | 6 | Urban | 1 | Phone | 0 | 4.000000 | 30500.000000 | 7833.333333 | 2 |
63998 | 603659 | 1 | Surburban | 1 | Multichannel | 0 | 3.200000 | 21600.000000 | 7583.333333 | 5 |
63999 | 254229 | 1 | Surburban | 0 | Web | 0 | 3.400000 | 24400.000000 | 9280.000000 | 5 |
64000 rows × 10 columns
ⅲ. One-hot Encoding¶
In [13]:
df.isna().sum()
Out[13]:
id 0 recency 0 zip_code 0 is_referral 0 channel 0 conversion 0 num_item 0 total_amount 0 avg_price 0 count 0 dtype: int64
In [14]:
df.head()
Out[14]:
id | recency | zip_code | is_referral | channel | conversion | num_item | total_amount | avg_price | count | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 906145 | 10 | Surburban | 0 | Phone | 0 | 3.333333 | 31333.333333 | 14016.666667 | 3 |
1 | 184478 | 6 | Rural | 1 | Web | 0 | 4.000000 | 29000.000000 | 7250.000000 | 1 |
2 | 394235 | 7 | Surburban | 1 | Web | 0 | 4.000000 | 20500.000000 | 5125.000000 | 2 |
3 | 130152 | 9 | Rural | 1 | Web | 0 | 1.750000 | 20750.000000 | 14875.000000 | 4 |
4 | 940352 | 2 | Urban | 0 | Web | 0 | 3.000000 | 31000.000000 | 10333.333333 | 1 |
In [15]:
print(df['zip_code'].unique(), df['zip_code'].nunique())
print(df['channel'].unique(), df['channel'].nunique())
['Surburban' 'Rural' 'Urban'] 3 ['Phone' 'Web' 'Multichannel'] 3
In [16]:
df = pd.get_dummies(df, columns=['zip_code', 'channel'], drop_first=True)
ⅳ. Modeling¶
In [17]:
from sklearn.model_selection import train_test_split
X = df.drop(['id', 'conversion'], axis=1)
y = df['conversion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)
In [18]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth = 10, random_state = 1234)
model.fit(X_train, y_train)
Out[18]:
RandomForestClassifier(max_depth=10, random_state=1234)
In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
pred = model.predict(X_test)
print(accuracy_score(y_test, pred))
confusion_matrix(y_test, pred)
0.8713541666666667
Out[19]:
array([[16348, 74], [ 2396, 382]], dtype=int64)
In [20]:
print(classification_report(y_test, pred))
precision recall f1-score support 0 0.87 1.00 0.93 16422 1 0.84 0.14 0.24 2778 accuracy 0.87 19200 macro avg 0.85 0.57 0.58 19200 weighted avg 0.87 0.87 0.83 19200
ⅴ. Tree Regressor¶
- 기본적으로 이진분류모델인 Random Forest를 연속적인 값도 예측이 가능하게 함
In [21]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth = 10, random_state = 1234)
rf.fit(X_train, y_train)
Out[21]:
RandomForestRegressor(max_depth=10, random_state=1234)
In [22]:
pred = rf.predict(X_test)
pd.DataFrame([pred, y_test])
# pred의 값은 y_test와는 다르게 0과 1의 형태가 아님
# 주관적인 기준으로 pred의 값을 이진분류형태로 치환
Out[22]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19190 | 19191 | 19192 | 19193 | 19194 | 19195 | 19196 | 19197 | 19198 | 19199 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.130254 | 0.081264 | 0.121347 | 0.202229 | 0.212367 | 0.136106 | 0.115631 | 0.0 | 0.137646 | 0.151976 | ... | 0.058867 | 0.63356 | 0.086244 | 0.145602 | 0.0 | 0.113222 | 0.060416 | 0.132652 | 0.312275 | 0.0 |
1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | ... | 0.000000 | 1.00000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 |
2 rows × 19200 columns
In [23]:
# pred의 값이 0.5 이상이면 1 반환
res1 = pd.Series(pred).apply(lambda x: 1 if x >= 0.5 else 0)
res1
Out[23]:
0 0 1 0 2 0 3 0 4 0 .. 19195 0 19196 0 19197 0 19198 0 19199 0 Length: 19200, dtype: int64
In [24]:
print(accuracy_score(y_test, res1))
print(confusion_matrix(y_test, res1))
print(classification_report(y_test, res1))
0.8759375 [[16288 134] [ 2248 530]] precision recall f1-score support 0 0.88 0.99 0.93 16422 1 0.80 0.19 0.31 2778 accuracy 0.88 19200 macro avg 0.84 0.59 0.62 19200 weighted avg 0.87 0.88 0.84 19200
In [25]:
# pred의 값이 0.3 이상이면 1 반환
res2 = pd.Series(pred).apply(lambda x: 1 if x >= 0.3 else 0)
res2
Out[25]:
0 0 1 0 2 0 3 0 4 0 .. 19195 0 19196 0 19197 0 19198 1 19199 0 Length: 19200, dtype: int64
In [26]:
print(accuracy_score(y_test, res2))
print(confusion_matrix(y_test, res2))
print(classification_report(y_test, res2))
0.8653125 [[15729 693] [ 1893 885]] precision recall f1-score support 0 0.89 0.96 0.92 16422 1 0.56 0.32 0.41 2778 accuracy 0.87 19200 macro avg 0.73 0.64 0.67 19200 weighted avg 0.84 0.87 0.85 19200
ⅵ. 변수 중요도 파악¶
In [27]:
X_train.columns
Out[27]:
Index(['recency', 'is_referral', 'num_item', 'total_amount', 'avg_price', 'count', 'zip_code_Surburban', 'zip_code_Urban', 'channel_Phone', 'channel_Web'], dtype='object')
In [28]:
rf.feature_importances_
Out[28]:
array([0.04335594, 0.01528215, 0.38190484, 0.14378719, 0.23511524, 0.1636367 , 0.00359206, 0.0035514 , 0.00596767, 0.00380682])
In [29]:
plt.figure(figsize=(20, 10))
sns.barplot(x=X_train.columns, y=rf.feature_importances_)
Out[29]:
<AxesSubplot:>
'데이터 분석 > Proj. E-Commerce' 카테고리의 다른 글
E-Commerce Part Ⅷ: 자연어분석(NLP) 응용 (0) | 2021.01.18 |
---|---|
E-Commerce Part Ⅶ: 시계열 분석 응용 (0) | 2021.01.18 |
E-Commerce Part Ⅵ: K Means Clustering 응용 (0) | 2021.01.16 |
E-Commerce Part Ⅳ: Decision Tree 모델 응용 (0) | 2021.01.16 |
E-Commerce Part Ⅲ: KNN 모델 응용 (0) | 2021.01.15 |
E-Commerce Part Ⅱ: 로지스틱회귀분석 응용 (0) | 2021.01.15 |
E-Commerce Part Ⅰ: 선형회귀분석 응용 (0) | 2021.01.12 |