Age 0.024471
CCAvg 0.098468
CD Account 4.372577
CreditCard -1.237447
Education 1.520329
Experience -0.007032
Family 0.757911
Income 0.054695
Mortgage -0.000133
Online -0.440746
Securities Account -1.852006
const -13.920298
dtype: float64
1
2
3
4
5
6
## 나이가 한살 많을수록 대출할 확률이 1.024배 높다.
## 수입이 1단위 높을수록 대출할 확률이 1.05배 높다
## 가족 구성원수가 1명 많을수록 대출할 확률이 2.13배 높다
## 경력이 1단위 높을수록 대출할 확률이 0.99배 높다(귀무가설 채택)
# Experience, Mortgage는 제외할 필요성이 있어보임
np.exp(results.params)
Age 1.024773e+00
CCAvg 1.103479e+00
CD Account 7.924761e+01
CreditCard 2.901239e-01
Education 4.573729e+00
Experience 9.929928e-01
Family 2.133814e+00
Income 1.056218e+00
Mortgage 9.998665e-01
Online 6.435563e-01
Securities Account 1.569221e-01
const 9.005163e-07
dtype: float64
1
# Scores the held-out feature matrix with the fitted model.
# NOTE(review): presumably class-1 probabilities from the Logit fit whose
# params are exponentiated above — confirm against the fitting cell.
pred_y=results.predict(test_x)
1
2
3
4
5
6
7
def cut_off(y, threshold):
    """Binarize probability scores: 1 where y > threshold, else 0.

    Works on a pandas Series or a numpy array and never mutates the input.
    The original two-pass in-place masking (set >threshold to 1, then
    <=threshold to 0) silently corrupts results whenever threshold >= 1:
    the freshly written 1s are themselves <= threshold and get reset to 0.
    A single vectorized comparison has no such ordering hazard.
    """
    return (y > threshold).astype(int)

# Turn the predicted probabilities into hard 0/1 labels at the usual 0.5 cut.
pred_Y = cut_off(pred_y, 0.5)
defprocessSubset(X,y,feature_set):model=sm.Logit(y,X[list(feature_set)])regr=model.fit()AIC=regr.aicreturn{"model":regr,"AIC":AIC}'''
전진선택법
'''
def forward(X, y, predictors):
    """One step of forward selection: try adding each remaining column and
    return the candidate fit (always including 'const') with the lowest AIC."""
    # Columns not already chosen; the intercept column is never a candidate.
    remaining_predictors = [p for p in X.columns.difference(['const'])
                            if p not in predictors]
    tic = time.time()
    results = []
    for p in remaining_predictors:
        results.append(processSubset(X=X, y=y, feature_set=predictors + [p] + ['const']))
    # Collect the candidate fits into a frame so they can be ranked by AIC.
    models = pd.DataFrame(results)
    # argmin is positional, but the freshly built frame has a default
    # RangeIndex, so .loc lands on the same row.
    best_model = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors) + 1,
          "predictors in", (toc - tic))
    # FIX: best_model[0] was positional access on a label-indexed Series,
    # which raises KeyError on modern pandas; address the AIC by label.
    print('Selected predictors:', best_model['model'].model.exog_names,
          ' AIC:', best_model['AIC'])
    return best_model

def forward_model(X, y):
    """Run forward selection until adding another variable no longer lowers
    the AIC; return the final (best) fitted model."""
    Fmodels = pd.DataFrame(columns=["AIC", "model"])
    tic = time.time()
    predictors = []  # variables accepted so far
    # At most one variable can be added per step: 1..n_features.
    for i in range(1, len(X.columns.difference(['const'])) + 1):
        Forward_result = forward(X=X, y=y, predictors=predictors)
        if i > 1:
            if Forward_result['AIC'] > Fmodel_before:
                break  # adding another variable made the model worse
        Fmodels.loc[i] = Forward_result
        predictors = Fmodels.loc[i]["model"].model.exog_names
        Fmodel_before = Fmodels.loc[i]["AIC"]
        predictors = [k for k in predictors if k != 'const']
    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    # Rows were labeled 1..len(Fmodels), so this picks the last accepted model.
    return (Fmodels['model'][len(Fmodels['model'])])
'''
후진소거법
'''
def backward(X, y, predictors):
    """One step of backward elimination: try dropping each variable in turn
    and return the candidate fit (always including 'const') with the lowest AIC."""
    tic = time.time()
    results = []
    # Every subset obtained by removing exactly one current predictor.
    for combo in itertools.combinations(predictors, len(predictors) - 1):
        results.append(processSubset(X=X, y=y, feature_set=list(combo) + ['const']))
    models = pd.DataFrame(results)
    # Lowest-AIC candidate; fresh RangeIndex, so positional argmin + .loc agree.
    best_model = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors) - 1,
          "predictors in", (toc - tic))
    # FIX: best_model[0] was positional access on a label-indexed Series
    # (KeyError on modern pandas); address the AIC by label.
    print('Selected predictors:', best_model['model'].model.exog_names,
          ' AIC:', best_model['AIC'])
    return best_model

def backward_model(X, y):
    """Run backward elimination until dropping another variable no longer
    lowers the AIC; return the last accepted fitted model."""
    Bmodels = pd.DataFrame(columns=["AIC", "model"], index=range(1, len(X.columns)))
    tic = time.time()
    predictors = X.columns.difference(['const'])
    Bmodel_before = processSubset(X, y, predictors)['AIC']
    while (len(predictors) > 1):
        # FIX: the original called backward(X=train_x, y=train_y, ...),
        # silently ignoring this function's own X/y arguments for globals.
        Backward_result = backward(X=X, y=y, predictors=predictors)
        if Backward_result['AIC'] > Bmodel_before:
            break  # removing another variable made the model worse
        Bmodels.loc[len(predictors) - 1] = Backward_result
        predictors = Bmodels.loc[len(predictors) - 1]["model"].model.exog_names
        Bmodel_before = Backward_result['AIC']
        predictors = [k for k in predictors if k != 'const']
    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    # Rows were filled from high index downward; after dropna the first row
    # (lowest index = fewest predictors) is the last accepted model.
    return (Bmodels['model'].dropna().iloc[0])
'''
단계적 선택법
'''defStepwise_model(X,y):Stepmodels=pd.DataFrame(columns=["AIC","model"])tic=time.time()predictors=[]Smodel_before=processSubset(X,y,predictors+['const'])['AIC']# 변수 1~10개 : 0~9 -> 1~10foriinrange(1,len(X.columns.difference(['const']))+1):Forward_result=forward(X=X,y=y,predictors=predictors)# constant addedprint('forward')Stepmodels.loc[i]=Forward_resultpredictors=Stepmodels.loc[i]["model"].model.exog_namespredictors=[kforkinpredictorsifk!='const']Backward_result=backward(X=X,y=y,predictors=predictors)ifBackward_result['AIC']<Forward_result['AIC']:Stepmodels.loc[i]=Backward_resultpredictors=Stepmodels.loc[i]["model"].model.exog_namesSmodel_before=Stepmodels.loc[i]["AIC"]predictors=[kforkinpredictorsifk!='const']print('backward')ifStepmodels.loc[i]['AIC']>Smodel_before:breakelse:Smodel_before=Stepmodels.loc[i]["AIC"]toc=time.time()print("Total elapsed time:",(toc-tic),"seconds.")return(Stepmodels['model'][len(Stepmodels['model'])])
# Score the hold-out set with each selected model. A selection result stores
# the design-matrix column names it was fitted on in .model.exog_names, so
# test_x is subset to exactly those columns before predicting.
pred_y_full = results2.predict(test_x2)  # full model
pred_y_forward = Forward_best_model.predict(test_x[Forward_best_model.model.exog_names])
pred_y_backward = Backward_best_model.predict(test_x[Backward_best_model.model.exog_names])
pred_y_stepwise = Stepwise_best_model.predict(test_x[Stepwise_best_model.model.exog_names])
data = []
acc_table = []
# Fit one Lasso per regularization strength, recording its coefficients and
# its hold-out classification accuracy. (The loop index from the original
# enumerate() was never used, so iterate over the alpha values directly.)
for a in alpha:
    lasso = Lasso(alpha=a).fit(train_x, train_y)
    # Intercept first, then one coefficient per feature.
    data.append(pd.Series(np.hstack([lasso.intercept_, lasso.coef_])))
    pred_y = lasso.predict(test_x)   # continuous scores from the linear model
    pred_y = cut_off(pred_y, 0.5)    # binarize at the 0.5 threshold
    cfmat = confusion_matrix(test_y, pred_y)
    acc_table.append(acc(cfmat))
# One column per alpha: rows are intercept + coefficients.
df_lasso = pd.DataFrame(data, index=alpha).T
df_lasso  # notebook display of the coefficient table
acc_table_lasso = pd.DataFrame(acc_table, index=alpha).T
1
df_lasso
0.001
0.01
0.1
1.0
10.0
0
0.932
0.936
0.894667
0.897333
0.897333
1
2
3
4
5
6
7
8
9
10
11
12
13
data = []
acc_table = []
# Fit one Ridge per regularization strength, recording its coefficients and
# its hold-out classification accuracy. (The loop index from the original
# enumerate() was never used, so iterate over the alpha values directly.)
for a in alpha:
    ridge = Ridge(alpha=a).fit(train_x, train_y)
    # Intercept first, then one coefficient per feature.
    data.append(pd.Series(np.hstack([ridge.intercept_, ridge.coef_])))
    pred_y = ridge.predict(test_x)   # continuous scores from the linear model
    pred_y = cut_off(pred_y, 0.5)    # binarize at the 0.5 threshold
    cfmat = confusion_matrix(test_y, pred_y)
    acc_table.append(acc(cfmat))
# One column per alpha: rows are intercept + coefficients.
df_ridge = pd.DataFrame(data, index=alpha).T
acc_table_ridge = pd.DataFrame(acc_table, index=alpha).T