범주형 변수 처리

2021.3.27 2021.3.27 Data_Science/Data_Handling 392 2 mins

1
2
3
4


import pandas as pd
import os

# Load the car evaluation dataset from the working directory into a DataFrame.
DATA_PATH = "car-good.csv"
df = pd.read_csv(DATA_PATH)

1
2
3


# Separate the label column ('Class') from the feature columns.
Y = df['Class']
X = df.drop(columns = 'Class')

1
2
3


# Split into training and evaluation data.
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X, Y,
    random_state = 0,  # fixed seed so the scores reported below can be reproduced
    stratify = Y,      # keep the label ratio equal in both splits (the classes look heavily imbalanced)
)

1
2
3


# Convert the string labels to numbers.
# The result is rebound rather than mutated with inplace=True: Train_Y / Test_Y
# come out of train_test_split, and in-place edits on such derived objects are
# what pandas flags as "setting on a copy".
label_map = {"negative": -1, "positive": 1}
Train_Y = Train_Y.replace(label_map)
Test_Y = Test_Y.replace(label_map)

1
2
3


# Check each column's number of distinct values to judge whether it is categorical
# => confirms that every variable is categorical.
# dropna=False counts NaN as a value, matching len(Series.unique()).
for col, n_levels in Train_X.nunique(dropna = False).items():
    print(col, n_levels)

Buying 4
Maint 4
Doors 3
Persons 2
Lug_boot 3
Safety 3

더미화를 이용한 범주형 변수 처리

1

# Every variable is categorical, so cast all columns to string type before dummy encoding.
Train_X = Train_X.astype(dtype = str)

1
2
3
4
5
6
7
8


# One-hot (dummy) encode every feature column.
try:
    # Module path and class name used by feature_engine < 1.0.
    from feature_engine.categorical_encoders import OneHotCategoricalEncoder as OHE
except ImportError:
    # feature_engine >= 1.0 moved and renamed the encoder.
    from feature_engine.encoding import OneHotEncoder as OHE

dummy_model = OHE(variables = Train_X.columns.tolist(),
                 drop_last = True)  # drop one dummy per variable

# The encoder learns its categories from the string-typed training data only.
dummy_model.fit(Train_X)

d_Train_X = dummy_model.transform(Train_X)
# Test_X needs the same string cast that Train_X received; without it the
# learned string categories are compared against non-string values and cannot match.
d_Test_X = dummy_model.transform(Test_X.astype(str))

  res_values = method(rvalues)

1
2
3
4
5
6
7


# 더미화를 한 뒤의 모델 테스트
from sklearn.neighbors import KNeighborsClassifier as KNN
model = KNN().fit(d_Train_X, Train_Y)
pred_Y = model.predict(d_Test_X)

from sklearn.metrics import f1_score
f1_score(Test_Y, pred_Y)

0.0

범주형 변수를 연속형 변수로 치환

1
2
3
4
5


# Target (mean) encoding: replace each category with the mean of 'Class'
# observed for that category in the training data.
Train_df = pd.concat([Train_X, Train_Y], axis = 1)
# Test_X comes out of train_test_split; take an explicit copy so the column
# assignments below do not trigger SettingWithCopyWarning.
Test_X = Test_X.copy()
# Fallback for test categories that never occur in the training data.
global_mean = Train_df['Class'].mean()
for col in Train_X.columns: # normally only the categorical columns would be iterated
    # Key both frames by the string form of the category so they always agree.
    train_keys = Train_df[col].astype(str)
    # Dict of category -> mean Class for this column; it must also be kept
    # if new data is to be encoded later.
    temp_dict = Train_df['Class'].groupby(train_keys).mean().to_dict()
    Train_df[col] = train_keys.map(temp_dict) # encode the training column
    # The evaluation data must be encoded with the mapping learned from training.
    Test_X[col] = Test_X[col].astype(str).map(temp_dict).fillna(global_mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """

1

# Preview the first five rows of the encoded training frame.
Train_df.head(n = 5)

	Buying	Maint	Doors	Persons	Lug_boot	Safety	Class
810	-0.809524	-0.82716	-0.913462	-1.0	-0.921951	-1.000000	-1
471	-0.925466	-1.00000	-0.935185	-1.0	-0.926267	-1.000000	-1
381	-1.000000	-0.82716	-0.913462	-1.0	-0.926267	-1.000000	-1
80	-1.000000	-1.00000	-0.946429	-1.0	-0.946903	-0.869159	-1
637	-0.925466	-0.82716	-0.935185	-1.0	-0.946903	-0.924171	-1

1
2


# Split the encoded training frame back into label and features.
Train_Y = Train_df['Class']
Train_X = Train_df.drop(columns = 'Class')

1
2
3
4
5
6
7
8


# Model test after the replacement: fit k-NN on the encoded training data
# and score the predictions on the encoded evaluation data with F1.
model = KNN()
model.fit(Train_X, Train_Y)
pred_Y = model.predict(Test_X)

f1_score(Test_Y, pred_Y)


# Because this preprocessing takes the label into account, it gave a better result than dummy encoding
# => the dimensionality drops and there is a performance benefit, although ...
# (NOTE: the original comment breaks off at this point.)

0.20000000000000004

Load Comments?