Variable Selection and Regression Coefficient Shrinkage

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import statsmodels.api as sm
import matplotlib.pyplot as plt
import itertools
import time
ploan = pd.read_csv("./data/Personal Loan.csv")
ploan_processed = ploan.dropna().drop(['ID','ZIP Code'], axis=1, inplace=False)
ploan_processed = sm.add_constant(ploan_processed, has_constant='add')
  • Separate the explanatory variables (X) from the target variable (Y), and split into training and test data
feature_columns = list(ploan_processed.columns.difference(["Personal Loan"]))
X = ploan_processed[feature_columns]
y = ploan_processed['Personal Loan'] # loan taken: 1, not taken: 0
train_x, test_x, train_y, test_y = train_test_split(X, y, stratify=y,train_size=0.7,test_size=0.3,random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
(1750, 12) (750, 12) (1750,) (750,)

Logistic regression modeling: y = f(x)

model = sm.Logit(train_y, train_x)
results = model.fit(method='newton')
Optimization terminated successfully.
         Current function value: 0.131055
         Iterations 9
results.summary()
Logit Regression Results
Dep. Variable: Personal Loan No. Observations: 1750
Model: Logit Df Residuals: 1738
Method: MLE Df Model: 11
Date: Fri, 23 Aug 2019 Pseudo R-squ.: 0.6030
Time: 14:58:19 Log-Likelihood: -229.35
converged: True LL-Null: -577.63
LLR p-value: 2.927e-142
coef std err z P>|z| [0.025 0.975]
Age 0.0245 0.102 0.240 0.810 -0.175 0.224
CCAvg 0.0985 0.063 1.562 0.118 -0.025 0.222
CD Account 4.3726 0.568 7.703 0.000 3.260 5.485
CreditCard -1.2374 0.337 -3.667 0.000 -1.899 -0.576
Education 1.5203 0.190 7.999 0.000 1.148 1.893
Experience -0.0070 0.102 -0.069 0.945 -0.206 0.192
Family 0.7579 0.128 5.914 0.000 0.507 1.009
Income 0.0547 0.004 12.659 0.000 0.046 0.063
Mortgage -0.0001 0.001 -0.144 0.885 -0.002 0.002
Online -0.4407 0.263 -1.674 0.094 -0.957 0.075
Securities Account -1.8520 0.561 -3.299 0.001 -2.952 -0.752
const -13.9203 2.773 -5.021 0.000 -19.354 -8.486
# performance measure
print("model AIC: ","{:.5f}".format(results.aic))
model AIC:  482.69329
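AIC is 2k - 2 ln(L): with k = 12 estimated coefficients (including the constant) and the log-likelihood of -229.35 reported in the summary, the value can be checked by hand. A minimal check against the fitted results object:

# AIC = 2k - 2*ln(L); llf is the log-likelihood of the fitted model
k = len(results.params)        # 12 coefficients, including the constant
print(2 * k - 2 * results.llf) # ~482.693, matching results.aic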
results.params
Age                    0.024471
CCAvg                  0.098468
CD Account             4.372577
CreditCard            -1.237447
Education              1.520329
Experience            -0.007032
Family                 0.757911
Income                 0.054695
Mortgage              -0.000133
Online                -0.440746
Securities Account    -1.852006
const                -13.920298
dtype: float64
## One additional year of age multiplies the odds of taking a loan by about 1.024.
## One additional unit of income multiplies the odds by about 1.056.
## One additional family member multiplies the odds by about 2.13.
## One additional unit of experience multiplies the odds by about 0.99 (fail to reject the null hypothesis).
# Experience and Mortgage look like candidates for removal.
np.exp(results.params)
Age                   1.024773e+00
CCAvg                 1.103479e+00
CD Account            7.924761e+01
CreditCard            2.901239e-01
Education             4.573729e+00
Experience            9.929928e-01
Family                2.133814e+00
Income                1.056218e+00
Mortgage              9.998665e-01
Online                6.435563e-01
Securities Account    1.569221e-01
const                 9.005163e-07
dtype: float64
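To read these odds ratios together with their significance, it helps to line them up with the p-values; a small sketch using the fitted results object:

# Odds ratio = exp(coef); values > 1 raise the odds, values < 1 lower them
odds_table = pd.DataFrame({'odds_ratio': np.exp(results.params),
                           'p_value': results.pvalues})
print(odds_table.sort_values('p_value'))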
pred_y = results.predict(test_x)
def cut_off(y, threshold):
    Y = y.copy() # copy so the original y is not modified
    Y[Y > threshold] = 1
    Y[Y <= threshold] = 0
    return Y.astype(int)

pred_Y = cut_off(pred_y,0.5)
cfmat = confusion_matrix(test_y,pred_Y)
print(cfmat)
[[661  12]
 [ 28  49]]
(cfmat[0,0]+cfmat[1,1])/np.sum(cfmat) ## accuracy
0.9466666666666667
def acc(cfmat):
    acc = (cfmat[0,0] + cfmat[1,1]) / np.sum(cfmat) ## accuracy
    return acc
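The same number comes straight from sklearn's accuracy_score, which was already imported above:

accuracy_score(test_y, pred_Y) # equals acc(confusion_matrix(test_y, pred_Y))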

Comparing performance by cut-off threshold

threshold = np.arange(0,1,0.1)
table = pd.DataFrame(columns=['ACC'])
for i in threshold:
    pred_Y = cut_off(pred_y,i)
    cfmat = confusion_matrix(test_y, pred_Y)
    table.loc[i] = acc(cfmat)
table.index.name='threshold'
table.columns.name='performance'
table

performance ACC
threshold
0.0 0.102667
0.1 0.908000
0.2 0.922667
0.3 0.933333
0.4 0.934667
0.5 0.946667
0.6 0.949333
0.7 0.946667
0.8 0.941333
0.9 0.937333
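Instead of scanning a fixed grid, the cut-off can also be chosen from the ROC thresholds themselves; one common rule is to maximize Youden's J = TPR - FPR. A minimal sketch:

# Cut-off maximizing Youden's J = TPR - FPR over the ROC thresholds
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y, pos_label=1)
print('best threshold (TPR - FPR):', thresholds[np.argmax(tpr - fpr)])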
# ROC curve via sklearn
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y, pos_label=1)

# Plot ROC curve
plt.plot(fpr,tpr)

# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)


AUC: 0.9463923891858513

[ROC curve: full model]

feature_columns = list(ploan_processed.columns.difference(["Personal Loan","Experience",  "Mortgage"]))
X = ploan_processed[feature_columns]
y = ploan_processed['Personal Loan'] # loan taken: 1, not taken: 0
train_x2, test_x2, train_y, test_y = train_test_split(X, y, stratify=y, train_size=0.7, test_size=0.3, random_state=42)
print(train_x2.shape, test_x2.shape, train_y.shape, test_y.shape)
(1750, 10) (750, 10) (1750,) (750,)
model = sm.Logit(train_y, train_x2)
results2 = model.fit(method='newton')
Optimization terminated successfully.
         Current function value: 0.131062
         Iterations 9
results2.summary()
Logit Regression Results
Dep. Variable: Personal Loan No. Observations: 1750
Model: Logit Df Residuals: 1740
Method: MLE Df Model: 9
Date: Fri, 23 Aug 2019 Pseudo R-squ.: 0.6029
Time: 14:58:19 Log-Likelihood: -229.36
converged: True LL-Null: -577.63
LLR p-value: 3.817e-144
coef std err z P>|z| [0.025 0.975]
Age 0.0174 0.011 1.569 0.117 -0.004 0.039
CCAvg 0.0997 0.062 1.596 0.111 -0.023 0.222
CD Account 4.3699 0.567 7.705 0.000 3.258 5.481
CreditCard -1.2350 0.337 -3.668 0.000 -1.895 -0.575
Education 1.5249 0.187 8.156 0.000 1.158 1.891
Family 0.7572 0.127 5.948 0.000 0.508 1.007
Income 0.0546 0.004 12.833 0.000 0.046 0.063
Online -0.4418 0.263 -1.678 0.093 -0.958 0.074
Securities Account -1.8526 0.561 -3.302 0.001 -2.952 -0.753
const -13.7465 1.164 -11.814 0.000 -16.027 -11.466
For comparison, the summary of the original full model once more:
results.summary()
Logit Regression Results
Dep. Variable: Personal Loan No. Observations: 1750
Model: Logit Df Residuals: 1738
Method: MLE Df Model: 11
Date: Fri, 23 Aug 2019 Pseudo R-squ.: 0.6030
Time: 14:58:19 Log-Likelihood: -229.35
converged: True LL-Null: -577.63
LLR p-value: 2.927e-142
coef std err z P>|z| [0.025 0.975]
Age 0.0245 0.102 0.240 0.810 -0.175 0.224
CCAvg 0.0985 0.063 1.562 0.118 -0.025 0.222
CD Account 4.3726 0.568 7.703 0.000 3.260 5.485
CreditCard -1.2374 0.337 -3.667 0.000 -1.899 -0.576
Education 1.5203 0.190 7.999 0.000 1.148 1.893
Experience -0.0070 0.102 -0.069 0.945 -0.206 0.192
Family 0.7579 0.128 5.914 0.000 0.507 1.009
Income 0.0547 0.004 12.659 0.000 0.046 0.063
Mortgage -0.0001 0.001 -0.144 0.885 -0.002 0.002
Online -0.4407 0.263 -1.674 0.094 -0.957 0.075
Securities Account -1.8520 0.561 -3.299 0.001 -2.952 -0.752
const -13.9203 2.773 -5.021 0.000 -19.354 -8.486
pred_y = results2.predict(test_x2)
pred_Y = cut_off(pred_y,0.5)
cfmat = confusion_matrix(test_y,pred_Y)
print(cfmat)
[[660  13]
 [ 29  48]]
acc(cfmat) ## accuracy
0.944
threshold = np.arange(0,1,0.1)
table = pd.DataFrame(columns=['ACC'])
for i in threshold:
    pred_Y = cut_off(pred_y,i)
    cfmat = confusion_matrix(test_y, pred_Y)
    table.loc[i] =acc(cfmat)
table.index.name='threshold'
table.columns.name='performance'
table

performance ACC
threshold
0.0 0.102667
0.1 0.908000
0.2 0.922667
0.3 0.932000
0.4 0.936000
0.5 0.944000
0.6 0.949333
0.7 0.946667
0.8 0.941333
0.9 0.937333
# ROC curve via sklearn
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y, pos_label=1)

# Plot ROC curve
plt.plot(fpr,tpr)

# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)

AUC: 0.9465467667547905

Variable selection

feature_columns = list(ploan_processed.columns.difference(["Personal Loan"]))
X = ploan_processed[feature_columns]
y = ploan_processed['Personal Loan'] # loan taken: 1, not taken: 0
train_x, test_x, train_y, test_y = train_test_split(X, y, stratify=y,train_size=0.7,test_size=0.3,random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
(1750, 12) (750, 12) (1750,) (750,)
def processSubset(X, y, feature_set):
    model = sm.Logit(y, X[list(feature_set)])
    regr = model.fit()
    AIC = regr.aic
    return {"model": regr, "AIC": AIC}
        
'''
Forward selection
'''
def forward(X, y, predictors):
    # candidate variables: everything not already in the predictor set
    remaining_predictors = [p for p in X.columns.difference(['const']) if p not in predictors]
    tic = time.time()
    results = []
    for p in remaining_predictors:
        results.append(processSubset(X=X, y=y, feature_set=predictors+[p]+['const']))
    # collect the fits into a DataFrame
    models = pd.DataFrame(results)

    # pick the model with the lowest AIC
    best_model = models.loc[models['AIC'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic))
    print('Selected predictors:', best_model['model'].model.exog_names, ' AIC:', best_model['AIC'])
    return best_model

def forward_model(X, y):
    Fmodels = pd.DataFrame(columns=["AIC", "model"])
    tic = time.time()
    # start from the empty predictor set
    predictors = []
    # add one variable per step, up to all candidate variables
    for i in range(1, len(X.columns.difference(['const'])) + 1):
        Forward_result = forward(X=X, y=y, predictors=predictors)
        if i > 1:
            # stop as soon as adding a variable no longer lowers the AIC
            if Forward_result['AIC'] > Fmodel_before:
                break
        Fmodels.loc[i] = Forward_result
        predictors = Fmodels.loc[i]["model"].model.exog_names
        Fmodel_before = Fmodels.loc[i]["AIC"]
        predictors = [k for k in predictors if k != 'const']
    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")

    return Fmodels['model'].iloc[-1]


'''
Backward elimination
'''
def backward(X, y, predictors):
    tic = time.time()
    results = []

    # try every subset that drops exactly one of the current predictors
    for combo in itertools.combinations(predictors, len(predictors) - 1):
        results.append(processSubset(X=X, y=y, feature_set=list(combo)+['const']))
    models = pd.DataFrame(results)

    # pick the model with the lowest AIC
    best_model = models.loc[models['AIC'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors) - 1, "predictors in",
          (toc - tic))
    print('Selected predictors:', best_model['model'].model.exog_names, ' AIC:', best_model['AIC'])
    return best_model


def backward_model(X, y):
    Bmodels = pd.DataFrame(columns=["AIC", "model"], index=range(1, len(X.columns)))
    tic = time.time()
    predictors = X.columns.difference(['const'])
    # baseline: the full model (with constant, consistent with backward()'s fits)
    Bmodel_before = processSubset(X, y, list(predictors) + ['const'])['AIC']
    while (len(predictors) > 1):
        Backward_result = backward(X=X, y=y, predictors=predictors)
        if Backward_result['AIC'] > Bmodel_before:
            break
        Bmodels.loc[len(predictors) - 1] = Backward_result
        predictors = Bmodels.loc[len(predictors) - 1]["model"].model.exog_names
        Bmodel_before = Backward_result['AIC']
        predictors = [k for k in predictors if k != 'const']

    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    # after dropna, the smallest index is the last model accepted
    return Bmodels['model'].dropna().iloc[0]


'''
Stepwise selection
'''
def Stepwise_model(X, y):
    Stepmodels = pd.DataFrame(columns=["AIC", "model"])
    tic = time.time()
    predictors = []
    Smodel_before = processSubset(X, y, predictors+['const'])['AIC']
    # at each step: one forward pass, then check whether a backward pass improves on it
    for i in range(1, len(X.columns.difference(['const'])) + 1):
        Forward_result = forward(X=X, y=y, predictors=predictors) # constant added inside
        print('forward')
        Stepmodels.loc[i] = Forward_result
        predictors = Stepmodels.loc[i]["model"].model.exog_names
        predictors = [k for k in predictors if k != 'const']
        Backward_result = backward(X=X, y=y, predictors=predictors)
        if Backward_result['AIC'] < Forward_result['AIC']:
            Stepmodels.loc[i] = Backward_result
            predictors = Stepmodels.loc[i]["model"].model.exog_names
            Smodel_before = Stepmodels.loc[i]["AIC"]
            predictors = [k for k in predictors if k != 'const']
            print('backward')
        if Stepmodels.loc[i]['AIC'] > Smodel_before:
            break
        else:
            Smodel_before = Stepmodels.loc[i]["AIC"]
    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    return Stepmodels['model'].iloc[-1]
Forward_best_model = forward_model(X=train_x, y= train_y)
Backward_best_model = backward_model(X=train_x,y=train_y)
Stepwise_best_model = Stepwise_model(X=train_x,y=train_y)
Total elapsed time: 0.9743940830230713 seconds.
pred_y_full = results2.predict(test_x2) # reduced model from earlier (Experience, Mortgage dropped)
pred_y_forward = Forward_best_model.predict(test_x[Forward_best_model.model.exog_names])
pred_y_backward = Backward_best_model.predict(test_x[Backward_best_model.model.exog_names])
pred_y_stepwise = Stepwise_best_model.predict(test_x[Stepwise_best_model.model.exog_names])
pred_Y_full= cut_off(pred_y_full,0.5)
pred_Y_forward = cut_off(pred_y_forward,0.5)
pred_Y_backward = cut_off(pred_y_backward,0.5)
pred_Y_stepwise = cut_off(pred_y_stepwise,0.5)
cfmat_full = confusion_matrix(test_y, pred_Y_full)
cfmat_forward = confusion_matrix(test_y, pred_Y_forward)
cfmat_backward = confusion_matrix(test_y, pred_Y_backward)
cfmat_stepwise = confusion_matrix(test_y, pred_Y_stepwise)
print(acc(cfmat_full))
print(acc(cfmat_forward))
print(acc(cfmat_backward))
print(acc(cfmat_stepwise))

0.944
0.944
0.944
0.944
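Accuracy alone does not separate the four models; comparing their training-fit AICs shows whether the smaller subsets pay for the dropped parameters. A sketch using the aic attribute of the fitted results:

# Training-fit AIC of each selected model (lower is better)
print('forward :', Forward_best_model.aic)
print('backward:', Backward_best_model.aic)
print('stepwise:', Stepwise_best_model.aic)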
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y_full, pos_label=1)
# Plot ROC curve
plt.plot(fpr,tpr)
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
AUC: 0.9465467667547905

[ROC curve: reduced model]

fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y_forward, pos_label=1)
# Plot ROC curve
plt.plot(fpr,tpr)
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
AUC: 0.9465467667547905

[ROC curve: forward selection model]

fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y_backward, pos_label=1)
# Plot ROC curve
plt.plot(fpr,tpr)
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
AUC: 0.9465467667547905
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y_stepwise, pos_label=1)
# Plot ROC curve
plt.plot(fpr,tpr)
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
AUC: 0.9465467667547905

[ROC curve: stepwise selection model]

### Performance-wise, the four models show little difference.

Lasso & Ridge

from sklearn.linear_model import Ridge, Lasso, ElasticNet

ploan_processed = ploan.dropna().drop(['ID','ZIP Code'], axis=1, inplace=False)

feature_columns = list(ploan_processed.columns.difference(["Personal Loan"]))
X = ploan_processed[feature_columns]
y = ploan_processed['Personal Loan'] # loan taken: 1, not taken: 0

train_x, test_x, train_y, test_y = train_test_split(X, y, stratify=y,train_size=0.7,test_size=0.3,random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
(1750, 11) (750, 11) (1750,) (750,)
ll =Lasso(alpha=0.01) ## lasso
ll.fit(train_x,train_y)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
ll.coef_
array([ 0.00000000e+00,  2.04783983e-03,  1.14390390e-01, -0.00000000e+00,
        6.58342418e-02,  4.76625359e-04,  3.13396711e-02,  3.55393865e-03,
        1.31719530e-05,  0.00000000e+00, -0.00000000e+00])
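The bare array is hard to read; pairing the coefficients with the column names shows that the L1 penalty zeroed out Age, CreditCard, Online, and Securities Account:

# Map coefficients back to feature names; exact zeros were dropped by the L1 penalty
print(pd.Series(ll.coef_, index=train_x.columns))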
For comparison, the coefficients of the unpenalized logistic model:
results.summary()
Logit Regression Results
Dep. Variable: Personal Loan No. Observations: 1750
Model: Logit Df Residuals: 1738
Method: MLE Df Model: 11
Date: Fri, 23 Aug 2019 Pseudo R-squ.: 0.6030
Time: 14:58:38 Log-Likelihood: -229.35
converged: True LL-Null: -577.63
LLR p-value: 2.927e-142
coef std err z P>|z| [0.025 0.975]
Age 0.0245 0.102 0.240 0.810 -0.175 0.224
CCAvg 0.0985 0.063 1.562 0.118 -0.025 0.222
CD Account 4.3726 0.568 7.703 0.000 3.260 5.485
CreditCard -1.2374 0.337 -3.667 0.000 -1.899 -0.576
Education 1.5203 0.190 7.999 0.000 1.148 1.893
Experience -0.0070 0.102 -0.069 0.945 -0.206 0.192
Family 0.7579 0.128 5.914 0.000 0.507 1.009
Income 0.0547 0.004 12.659 0.000 0.046 0.063
Mortgage -0.0001 0.001 -0.144 0.885 -0.002 0.002
Online -0.4407 0.263 -1.674 0.094 -0.957 0.075
Securities Account -1.8520 0.561 -3.299 0.001 -2.952 -0.752
const -13.9203 2.773 -5.021 0.000 -19.354 -8.486
pred_y_lasso = ll.predict(test_x) # lasso predictions
pred_Y_lasso= cut_off(pred_y_lasso,0.5)
cfmat = confusion_matrix(test_y, pred_Y_lasso)
print(acc(cfmat))

0.936
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y_lasso, pos_label=1)
# Plot ROC curve
plt.plot(fpr,tpr)
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
AUC: 0.9439995368672932

[ROC curve: lasso]

rr = Ridge(alpha=0.01) ## ridge
rr.fit(train_x,train_y)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
rr.coef_ ## ridge result
array([-3.71283678e-03,  7.37570775e-03,  3.54973975e-01, -5.28579506e-02,
        7.83404224e-02,  4.12823466e-03,  3.62504712e-02,  3.27385112e-03,
        1.73105480e-06, -1.91297381e-02, -8.77388670e-02])
ll.coef_ ## lasso result
array([ 0.00000000e+00,  2.04783983e-03,  1.14390390e-01, -0.00000000e+00,
        6.58342418e-02,  4.76625359e-04,  3.13396711e-02,  3.55393865e-03,
        1.31719530e-05,  0.00000000e+00, -0.00000000e+00])
pred_y_ridge = rr.predict(test_x) # ridge predictions
pred_Y_ridge = cut_off(pred_y_ridge, 0.5)
cfmat = confusion_matrix(test_y, pred_Y_ridge)
print(acc(cfmat))

0.932
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y_ridge, pos_label=1)
# Plot ROC curve
plt.plot(fpr,tpr)
# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)
AUC: 0.9494992377607533

[ROC curve: ridge]

alpha = np.logspace(-3, 1, 5)
alpha
array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])
data = []
acc_table=[]
for i, a in enumerate(alpha):
    lasso = Lasso(alpha=a).fit(train_x, train_y)
    data.append(pd.Series(np.hstack([lasso.intercept_, lasso.coef_])))
    pred_y = lasso.predict(test_x) # predictions at this alpha
    pred_y= cut_off(pred_y,0.5)
    cfmat = confusion_matrix(test_y, pred_y)
    acc_table.append((acc(cfmat)))
    

df_lasso = pd.DataFrame(data, index=alpha).T
acc_table_lasso = pd.DataFrame(acc_table, index=alpha).T
acc_table_lasso

0.001 0.01 0.1 1.0 10.0
0 0.932 0.936 0.894667 0.897333 0.897333
data = []
acc_table=[]
for i, a in enumerate(alpha):
    ridge = Ridge(alpha=a).fit(train_x, train_y)
    data.append(pd.Series(np.hstack([ridge.intercept_, ridge.coef_])))
    pred_y = ridge.predict(test_x) # predictions at this alpha
    pred_y= cut_off(pred_y,0.5)
    cfmat = confusion_matrix(test_y, pred_y)
    acc_table.append((acc(cfmat)))

    
df_ridge = pd.DataFrame(data, index=alpha).T
acc_table_ridge = pd.DataFrame(acc_table, index=alpha).T
df_ridge

0.001 0.01 0.1 1.0 10.0
0 -0.289557 -0.289565 -0.289645 -0.290438 -0.297581
1 -0.003713 -0.003713 -0.003713 -0.003716 -0.003723
2 0.007376 0.007376 0.007376 0.007378 0.007388
3 0.355019 0.354974 0.354529 0.350141 0.311781
4 -0.052866 -0.052858 -0.052782 -0.052037 -0.045541
5 0.078340 0.078340 0.078341 0.078347 0.078316
6 0.004128 0.004128 0.004129 0.004136 0.004175
7 0.036250 0.036250 0.036254 0.036289 0.036578
8 0.003274 0.003274 0.003274 0.003278 0.003313
9 0.000002 0.000002 0.000002 0.000002 0.000004
10 -0.019134 -0.019130 -0.019086 -0.018655 -0.014925
11 -0.087756 -0.087739 -0.087569 -0.085897 -0.071545
acc_table_ridge

0.001 0.01 0.1 1.0 10.0
0 0.932 0.932 0.932 0.932 0.932
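Note that Lasso and Ridge here are linear regression models fit to a 0/1 target. The classification-native analogue is logistic regression with an L1 or L2 penalty; a sketch via sklearn (C is the inverse regularization strength, so C = 100 corresponds roughly to alpha = 0.01):

# Penalized logistic regression: the classification analogue of Lasso/Ridge
l1_model = LogisticRegression(penalty='l1', solver='liblinear', C=100).fit(train_x, train_y)
l2_model = LogisticRegression(penalty='l2', solver='liblinear', C=100).fit(train_x, train_y)
print(accuracy_score(test_y, l1_model.predict(test_x)))
print(accuracy_score(test_y, l2_model.predict(test_x)))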

Visualizing coefficient shrinkage as lambda changes

import matplotlib.pyplot as plt
ax1 = plt.subplot(121)
plt.semilogx(df_ridge.T)
plt.xticks(alpha)
plt.title("Ridge")

ax2 = plt.subplot(122)
plt.semilogx(df_lasso.T)
plt.xticks(alpha)
plt.title("Lasso")

plt.show()

[Coefficient paths: Ridge (left), Lasso (right)]
