import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the data (Otto product category dataset)
data = pd.read_csv("./data/otto_train.csv")
data.head()  # inspect the first rows
   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  feat_92  feat_93   target
0   1       1       0       0       0       0       0       0       0       0  ...        1        0        0        0        0        0        0        0        0  Class_1
1   2       0       0       0       0       0       0       0       1       0  ...        0        0        0        0        0        0        0        0        0  Class_1
2   3       0       0       0       0       0       0       0       1       0  ...        0        0        0        0        0        0        0        0        0  Class_1
3   4       1       0       0       1       6       1       5       0       0  ...        0        1        2        0        0        0        0        0        0  Class_1
4   5       0       0       0       0       0       0       0       0       0  ...        1        0        0        0        0        1        0        0        0  Class_1

[5 rows x 95 columns]
'''
id: unique identifier
feat_1 ~ feat_93: explanatory variables
target: target variable (classes 1~9)
'''
nCar = data.shape[0]  # number of observations
nVar = data.shape[1]  # number of variables
print('nCar: %d' % nCar, 'nVar: %d' % nVar)
nCar: 61878 nVar: 95
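Before modeling, it can also help to see how the nine classes are distributed; if they are far from uniform, plain accuracy should be read with care. A minimal check one might add here (not part of the original notebook):

# Class frequencies, largest first; accuracy alone can hide
# weak performance on the rarer classes.
print(data['target'].value_counts())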
data = data.drop(['id'], axis = 1)  # drop the id column
Convert the target variable's string labels to integers
mapping_dict = {"Class_1": 1,
                "Class_2": 2,
                "Class_3": 3,
                "Class_4": 4,
                "Class_5": 5,
                "Class_6": 6,
                "Class_7": 7,
                "Class_8": 8,
                "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])
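As a side note, the same mapping can be written with pandas' map, and scikit-learn's LabelEncoder can build the encoding automatically (it assigns 0~8 rather than 1~9). A sketch of the alternatives, with illustrative variable names:

# Equivalent to the apply above
after_mapping_target = data['target'].map(mapping_dict)

# Or let scikit-learn derive the encoding; note this yields 0~8, not 1~9
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded_target = le.fit_transform(data['target'])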
Separate the explanatory variables from the target variable, then split into training and test data
feature_columns = list(data.columns.difference(['target']))
X = data[feature_columns]
y = after_mapping_target
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
(49502, 93) (12376, 93) (49502,) (12376,)
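An optional variation, not used in the runs below: passing stratify=y keeps the class proportions identical in the training and test sets, which is often safer for multi-class data. The suffixed names are illustrative:

train_x_s, test_x_s, train_y_s, test_y_s = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y)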
- Fit a random forest model on the training data, then validate it on the test data
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
random_forest_model1 = RandomForestClassifier(n_estimators = 20,
                                              max_depth = 5,
                                              random_state = 42)
model1 = random_forest_model1.fit(train_x, train_y)
predict1 = model1.predict(test_x)
print("Accuracy: %.2f" % (accuracy_score(test_y, predict1) * 100), "%")
Accuracy: 60.16 %
Increasing the number of estimators
random_forest_model2 = RandomForestClassifier(n_estimators = 300,
                                              max_depth = 5,
                                              random_state = 42)
model2 = random_forest_model2.fit(train_x, train_y)
predict2 = model2.predict(test_x)
print("Accuracy: %.2f" % (accuracy_score(test_y, predict2) * 100), "%")
Accuracy: 61.73 %
Increasing the tree depth
random_forest_model3 = RandomForestClassifier(n_estimators = 300,
                                              max_depth = 20,
                                              random_state = 42)
model3 = random_forest_model3.fit(train_x, train_y)
predict3 = model3.predict(test_x)
print("Accuracy: %.2f" % (accuracy_score(test_y, predict3) * 100), "%")
Accuracy: 78.09 %
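Deeper trees fit the training data much more closely, so it is worth checking that the gain is not pure overfitting. Random forests offer a built-in estimate via the out-of-bag (OOB) score, computed from the samples each tree never saw during bagging. A sketch, not part of the original experiments:

oob_model = RandomForestClassifier(n_estimators = 300,
                                   max_depth = 20,
                                   oob_score = True,   # enable OOB estimation
                                   random_state = 42)
oob_model.fit(train_x, train_y)
print("OOB score: %.4f" % oob_model.oob_score_)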
Maximizing the tree depth
random_forest_model4 = RandomForestClassifier(n_estimators = 300,
                                              max_depth = 100,
                                              random_state = 42)
model4 = random_forest_model4.fit(train_x, train_y)
predict4 = model4.predict(test_x)  # predict on the test data
print("Accuracy: %.2f" % (accuracy_score(test_y, predict4) * 100), "%")
Accuracy: 81.23 %
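The experiments above vary n_estimators and max_depth by hand. As a closing sketch, the same sweep can be run systematically with scikit-learn's GridSearchCV; the grid values mirror the manual runs, while cv = 3 and n_jobs = -1 are illustrative choices, not tuned ones:

from sklearn.model_selection import GridSearchCV

# Grid mirroring the manual experiments above
param_grid = {'n_estimators': [20, 300],
              'max_depth': [5, 20, 100]}
grid = GridSearchCV(RandomForestClassifier(random_state = 42),
                    param_grid, cv = 3, n_jobs = -1)
grid.fit(train_x, train_y)
print(grid.best_params_)
print("CV accuracy: %.4f" % grid.best_score_)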