In [1]:
# Project 2 Decision Support System

# MSMA 12B
# Team: Shulian Guan, Ziting Liao, Xiaoran Li, Yiwei Zhang, Zheng Lian
In [2]:
!pwd
C:\Users\DELL\Desktop\Rochester\python group final
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
Data=pd.read_csv("heloc_dataset_v1.csv")

# 1. explore the whole dataset
Data.head(10)
Out[5]:
RiskPerformance ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec PercentTradesNeverDelq MSinceMostRecentDelq ... PercentInstallTrades MSinceMostRecentInqexcl7days NumInqLast6M NumInqLast6Mexcl7days NetFractionRevolvingBurden NetFractionInstallBurden NumRevolvingTradesWBalance NumInstallTradesWBalance NumBank2NatlTradesWHighUtilization PercentTradesWBalance
0 Bad 55 144 4 84 20 3 0 83 2 ... 43 0 0 0 33 -8 8 1 1 69
1 Bad 61 58 15 41 2 4 4 100 -7 ... 67 0 0 0 0 -8 0 -8 -8 0
2 Bad 67 66 5 24 9 0 0 100 -7 ... 44 0 4 4 53 66 4 2 1 86
3 Bad 66 169 1 73 28 1 1 93 76 ... 57 0 5 4 72 83 6 4 3 91
4 Bad 81 333 27 132 12 0 0 100 -7 ... 25 0 1 1 51 89 3 1 0 80
5 Bad 59 137 11 78 31 0 0 91 1 ... 47 0 0 0 62 93 12 4 3 94
6 Good 54 88 7 37 25 0 0 92 9 ... 58 0 4 4 89 76 7 7 2 100
7 Good 68 148 7 65 17 0 0 83 31 ... 44 0 0 0 28 48 2 2 2 40
8 Bad 59 324 2 138 24 0 0 85 5 ... 26 0 1 1 68 -8 7 1 3 90
9 Bad 61 79 4 36 19 0 0 95 5 ... 26 0 6 6 31 86 5 3 1 62

10 rows × 24 columns

In [26]:
Data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10459 entries, 0 to 10458
Data columns (total 24 columns):
RiskPerformance                       10459 non-null object
ExternalRiskEstimate                  10459 non-null int64
MSinceOldestTradeOpen                 10459 non-null int64
MSinceMostRecentTradeOpen             10459 non-null int64
AverageMInFile                        10459 non-null int64
NumSatisfactoryTrades                 10459 non-null int64
NumTrades60Ever2DerogPubRec           10459 non-null int64
NumTrades90Ever2DerogPubRec           10459 non-null int64
PercentTradesNeverDelq                10459 non-null int64
MSinceMostRecentDelq                  10459 non-null int64
MaxDelq2PublicRecLast12M              10459 non-null int64
MaxDelqEver                           10459 non-null int64
NumTotalTrades                        10459 non-null int64
NumTradesOpeninLast12M                10459 non-null int64
PercentInstallTrades                  10459 non-null int64
MSinceMostRecentInqexcl7days          10459 non-null int64
NumInqLast6M                          10459 non-null int64
NumInqLast6Mexcl7days                 10459 non-null int64
NetFractionRevolvingBurden            10459 non-null int64
NetFractionInstallBurden              10459 non-null int64
NumRevolvingTradesWBalance            10459 non-null int64
NumInstallTradesWBalance              10459 non-null int64
NumBank2NatlTradesWHighUtilization    10459 non-null int64
PercentTradesWBalance                 10459 non-null int64
dtypes: int64(23), object(1)
memory usage: 1.9+ MB
In [27]:
Data.describe()
Out[27]:
ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec PercentTradesNeverDelq MSinceMostRecentDelq MaxDelq2PublicRecLast12M ... PercentInstallTrades MSinceMostRecentInqexcl7days NumInqLast6M NumInqLast6Mexcl7days NetFractionRevolvingBurden NetFractionInstallBurden NumRevolvingTradesWBalance NumInstallTradesWBalance NumBank2NatlTradesWHighUtilization PercentTradesWBalance
count 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 ... 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000
mean 67.425758 184.205373 8.543455 73.843293 19.428052 0.042738 -0.142843 86.661536 6.762406 4.928291 ... 32.166460 -0.325366 0.868152 0.812602 31.629888 39.158906 3.185008 0.976097 0.018071 62.079166
std 21.121621 109.683816 13.301745 38.782803 13.004327 2.513910 2.367397 25.999584 20.501250 3.756275 ... 20.128634 6.067556 3.179304 3.143698 30.060140 42.101601 4.413173 4.060995 3.358135 27.711565
min -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 ... -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000
25% 63.000000 118.000000 3.000000 52.000000 12.000000 0.000000 0.000000 87.000000 -7.000000 4.000000 ... 20.000000 -7.000000 0.000000 0.000000 5.000000 -8.000000 2.000000 1.000000 0.000000 47.000000
50% 71.000000 178.000000 5.000000 74.000000 19.000000 0.000000 0.000000 96.000000 -7.000000 6.000000 ... 31.000000 0.000000 1.000000 1.000000 25.000000 47.000000 3.000000 2.000000 0.000000 67.000000
75% 79.000000 249.500000 11.000000 95.000000 27.000000 1.000000 0.000000 100.000000 14.000000 7.000000 ... 44.000000 1.000000 2.000000 2.000000 54.000000 79.000000 5.000000 3.000000 1.000000 82.000000
max 94.000000 803.000000 383.000000 383.000000 79.000000 19.000000 19.000000 100.000000 83.000000 9.000000 ... 100.000000 24.000000 66.000000 66.000000 232.000000 471.000000 32.000000 23.000000 18.000000 100.000000

8 rows × 23 columns

In [28]:
Data['RiskPerformance'].value_counts() 
Out[28]:
Bad     5459
Good    5000
Name: RiskPerformance, dtype: int64
In [29]:
# 2. data cleaning

# deal with missing value
for col in Data.columns:
    print(sum(Data[col]==-7),sum(Data[col]==-8), sum(Data[col]==-9), col)
0 0 0 RiskPerformance
0 0 598 ExternalRiskEstimate
0 239 588 MSinceOldestTradeOpen
0 0 588 MSinceMostRecentTradeOpen
0 0 588 AverageMInFile
0 0 588 NumSatisfactoryTrades
0 0 588 NumTrades60Ever2DerogPubRec
0 0 588 NumTrades90Ever2DerogPubRec
0 0 588 PercentTradesNeverDelq
4664 176 588 MSinceMostRecentDelq
0 0 588 MaxDelq2PublicRecLast12M
0 0 588 MaxDelqEver
0 0 588 NumTotalTrades
0 0 588 NumTradesOpeninLast12M
0 0 588 PercentInstallTrades
1855 476 588 MSinceMostRecentInqexcl7days
0 0 588 NumInqLast6M
0 0 588 NumInqLast6Mexcl7days
0 186 588 NetFractionRevolvingBurden
0 3419 588 NetFractionInstallBurden
0 156 588 NumRevolvingTradesWBalance
0 861 588 NumInstallTradesWBalance
0 583 588 NumBank2NatlTradesWHighUtilization
0 18 588 PercentTradesWBalance
In [30]:
# drop rows with all '-9'
Data = Data.replace(-9,np.nan)
Data = Data.dropna(axis=0)
In [31]:
Data.shape
Out[31]:
(9861, 24)
In [32]:
# deal with categorical variables
Data = pd.get_dummies(Data, columns=['MaxDelq2PublicRecLast12M'], drop_first=False)
Data = pd.get_dummies(Data, columns=['MaxDelqEver'], drop_first=False)
In [33]:
# convert the target variable to boolean
Data['RiskPerformance'] = Data['RiskPerformance'].replace("Bad",0)
Data['RiskPerformance'] = Data['RiskPerformance'].replace("Good",1)
In [34]:
Data.head(5)
Out[34]:
RiskPerformance ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec PercentTradesNeverDelq MSinceMostRecentDelq ... MaxDelq2PublicRecLast12M_6.0 MaxDelq2PublicRecLast12M_7.0 MaxDelq2PublicRecLast12M_9.0 MaxDelqEver_2.0 MaxDelqEver_3.0 MaxDelqEver_4.0 MaxDelqEver_5.0 MaxDelqEver_6.0 MaxDelqEver_7.0 MaxDelqEver_8.0
0 0 55.0 144.0 4.0 84.0 20.0 3.0 0.0 83.0 2.0 ... 0 0 0 0 0 0 1 0 0 0
1 0 61.0 58.0 15.0 41.0 2.0 4.0 4.0 100.0 -7.0 ... 0 0 0 0 0 0 0 0 0 1
2 0 67.0 66.0 5.0 24.0 9.0 0.0 0.0 100.0 -7.0 ... 0 1 0 0 0 0 0 0 0 1
3 0 66.0 169.0 1.0 73.0 28.0 1.0 1.0 93.0 76.0 ... 1 0 0 0 0 0 0 1 0 0
4 0 81.0 333.0 27.0 132.0 12.0 0.0 0.0 100.0 -7.0 ... 0 1 0 0 0 0 0 0 0 1

5 rows × 38 columns

In [10]:
for col in Data.columns: 
    print(col) 
RiskPerformance
ExternalRiskEstimate
MSinceOldestTradeOpen
MSinceMostRecentTradeOpen
AverageMInFile
NumSatisfactoryTrades
NumTrades60Ever2DerogPubRec
NumTrades90Ever2DerogPubRec
PercentTradesNeverDelq
MSinceMostRecentDelq
MaxDelq2PublicRecLast12M
MaxDelqEver
NumTotalTrades
NumTradesOpeninLast12M
PercentInstallTrades
MSinceMostRecentInqexcl7days
NumInqLast6M
NumInqLast6Mexcl7days
NetFractionRevolvingBurden
NetFractionInstallBurden
NumRevolvingTradesWBalance
NumInstallTradesWBalance
NumBank2NatlTradesWHighUtilization
PercentTradesWBalance
In [35]:
Data.iloc[:,19:25].describe()
Out[35]:
NumInstallTradesWBalance NumBank2NatlTradesWHighUtilization PercentTradesWBalance MaxDelq2PublicRecLast12M_0.0 MaxDelq2PublicRecLast12M_1.0 MaxDelq2PublicRecLast12M_2.0
count 9861.000000 9861.000000 9861.000000 9861.000000 9861.000000 9861.000000
mean 1.577021 0.560998 66.371768 0.033871 0.005476 0.005273
std 3.337305 2.601734 22.162833 0.180906 0.073802 0.072429
min -8.000000 -8.000000 -8.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 50.000000 0.000000 0.000000 0.000000
50% 2.000000 1.000000 67.000000 0.000000 0.000000 0.000000
75% 3.000000 1.000000 83.000000 0.000000 0.000000 0.000000
max 23.000000 18.000000 100.000000 1.000000 1.000000 1.000000
In [16]:
for col in Data.columns:
    print(sum(Data[col]==-7),sum(Data[col]==-8), sum(Data[col]==-9), col)
0 0 0 RiskPerformance
0 0 0 ExternalRiskEstimate
0 239 0 MSinceOldestTradeOpen
0 0 0 MSinceMostRecentTradeOpen
0 0 0 AverageMInFile
0 0 0 NumSatisfactoryTrades
0 0 0 NumTrades60Ever2DerogPubRec
0 0 0 NumTrades90Ever2DerogPubRec
0 0 0 PercentTradesNeverDelq
4658 176 0 MSinceMostRecentDelq
0 0 0 NumTotalTrades
0 0 0 NumTradesOpeninLast12M
0 0 0 PercentInstallTrades
1853 476 0 MSinceMostRecentInqexcl7days
0 0 0 NumInqLast6M
0 0 0 NumInqLast6Mexcl7days
0 179 0 NetFractionRevolvingBurden
0 3412 0 NetFractionInstallBurden
0 149 0 NumRevolvingTradesWBalance
0 854 0 NumInstallTradesWBalance
0 576 0 NumBank2NatlTradesWHighUtilization
0 11 0 PercentTradesWBalance
0 0 0 MaxDelq2PublicRecLast12M_0.0
0 0 0 MaxDelq2PublicRecLast12M_1.0
0 0 0 MaxDelq2PublicRecLast12M_2.0
0 0 0 MaxDelq2PublicRecLast12M_3.0
0 0 0 MaxDelq2PublicRecLast12M_4.0
0 0 0 MaxDelq2PublicRecLast12M_5.0
0 0 0 MaxDelq2PublicRecLast12M_6.0
0 0 0 MaxDelq2PublicRecLast12M_7.0
0 0 0 MaxDelq2PublicRecLast12M_9.0
0 0 0 MaxDelqEver_2.0
0 0 0 MaxDelqEver_3.0
0 0 0 MaxDelqEver_4.0
0 0 0 MaxDelqEver_5.0
0 0 0 MaxDelqEver_6.0
0 0 0 MaxDelqEver_7.0
0 0 0 MaxDelqEver_8.0
In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from IPython.display import Image 
from IPython.display import IFrame 
import warnings
warnings.filterwarnings("ignore")
In [37]:
# 3. Create train set, test set

train_set, test_set = train_test_split(Data, test_size=0.2, random_state=1)

train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

train_set.to_csv('TrainData.csv',index=False)
test_set.to_csv('TestData.csv',index=False)

print(train_set.shape, test_set.shape)

X = train_set.copy().drop("RiskPerformance", axis=1)
Y = train_set["RiskPerformance"].copy()
X_test = test_set.copy().drop("RiskPerformance", axis=1)
Y_test = test_set["RiskPerformance"].copy()
(7888, 38) (1973, 38)
In [ ]:
# 4. Run models

# 4.1 Logistic Regression

lin_reg = LogisticRegression()
scores = cross_val_score(lin_reg, X, Y, cv=10)
print("Train Set Accuracy:", scores.mean())
clf = lin_reg.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test, Y_test))
In [20]:
# 4.2 Decision Tree Classifier

param_grid = [{'max_depth':[3,5,7,9,11],'max_features':[2,4,6,8,10,12]}]
tree_clf = DecisionTreeClassifier(random_state=1)
grid_search = GridSearchCV(tree_clf, param_grid, cv=5)
grid_search.fit(X,Y)

print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_tree_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_tree_clf.score(X_test,Y_test))
{'max_depth': 5, 'max_features': 12}
Train Set Accuracy: 0.7137423935091278
Test Set Accuracy: 0.7045108971109985
In [21]:
# 4.3 Random Forest Classifier

param_grid = [{'n_estimators':[70,80,90,100],'max_features':[2,4,6,8,10]}]
rf_clf = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(rf_clf, param_grid, cv=4)
grid_search.fit(X,Y)

print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_rf_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_rf_clf.score(X_test,Y_test))
{'max_features': 4, 'n_estimators': 90}
Train Set Accuracy: 0.7363083164300203
Test Set Accuracy: 0.7131272174353775
In [ ]:
#Feature importance
clf_rf = RandomForestClassifier(max_features="sqrt", n_estimators=50, max_depth=1)
clf_rf = clf_rf.fit(X,Y)
# plot feature importance
pd.Series(data=clf_rf.feature_importances_, index=X.columns).sort_values().plot.bar()
In [22]:
# 4.4 KNN

param_grid = [{'n_neighbors':[5,10,20,25,30,40]}]
knn_clf = neighbors.KNeighborsClassifier()  
grid_search = GridSearchCV(knn_clf, param_grid, cv=5)
grid_search.fit(X,Y)

print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_knn_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_knn_clf.score(X_test,Y_test))
{'n_neighbors': 30}
Train Set Accuracy: 0.7058823529411765
Test Set Accuracy: 0.6882919412062849
In [23]:
# 4.5 Linear Discriminant Analysis

param_grid = [{'n_components':[2,4,6,8,10]}]
lda_clf = LinearDiscriminantAnalysis()  
grid_search = GridSearchCV(lda_clf, param_grid, cv=5)
grid_search.fit(X,Y)

print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_lda_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_lda_clf.score(X_test,Y_test))
{'n_components': 2}
Train Set Accuracy: 0.7325050709939148
Test Set Accuracy: 0.7192093258996453
In [25]:
# 4.6 BaggingClassifier based on decision tree classifier

tree_depth = 5
base_clf = tree.DecisionTreeClassifier(max_depth=tree_depth) # base classifier
results = []
n_range = range(1,100,1)
for n in n_range:
    clf_bagging = BaggingClassifier(n_estimators=n, base_estimator=base_clf)
    scores = cross_val_score(clf_bagging, X, Y, cv=5)

    results.append((n,scores.mean(), scores.std()))
    
df_bagging = pd.DataFrame(data=results,columns=['n','Bagging accuracy','Bagging error'])
df_bagging
Out[25]:
n Bagging accuracy Bagging error
0 1 0.706138 0.008542
1 2 0.712862 0.016594
2 3 0.721227 0.009569
3 4 0.725159 0.013392
4 5 0.727946 0.011011
5 6 0.727567 0.014109
6 7 0.730737 0.014440
7 8 0.727314 0.015260
8 9 0.726299 0.012478
9 10 0.726679 0.012309
10 11 0.729723 0.014433
11 12 0.726679 0.014091
12 13 0.727945 0.008753
13 14 0.727438 0.009028
14 15 0.728327 0.011403
15 16 0.729468 0.011711
16 17 0.726553 0.013607
17 18 0.730103 0.013419
18 19 0.728707 0.011922
19 20 0.724904 0.012932
20 21 0.732384 0.012878
21 22 0.730864 0.014698
22 23 0.730482 0.012465
23 24 0.730735 0.012298
24 25 0.728706 0.010538
25 26 0.730989 0.011192
26 27 0.730864 0.014951
27 28 0.731750 0.012530
28 29 0.729847 0.010302
29 30 0.727566 0.010858
... ... ... ...
69 70 0.729596 0.014817
70 71 0.731117 0.012680
71 72 0.729976 0.014267
72 73 0.731116 0.011510
73 74 0.729469 0.014071
74 75 0.727821 0.013465
75 76 0.729469 0.013513
76 77 0.730483 0.013305
77 78 0.729089 0.014895
78 79 0.728201 0.012370
79 80 0.729087 0.010587
80 81 0.729723 0.014468
81 82 0.730229 0.012803
82 83 0.729595 0.013250
83 84 0.728074 0.013100
84 85 0.730737 0.014002
85 86 0.730229 0.012098
86 87 0.729723 0.014824
87 88 0.730610 0.013748
88 89 0.730229 0.011379
89 90 0.727440 0.012204
90 91 0.730610 0.013784
91 92 0.731878 0.014701
92 93 0.729214 0.012053
93 94 0.730102 0.013185
94 95 0.731496 0.011933
95 96 0.728455 0.014197
96 97 0.729343 0.015047
97 98 0.729469 0.013431
98 99 0.729089 0.014867

99 rows × 3 columns

In [26]:
print(df_bagging['Bagging accuracy'].argmax()+1,df_bagging['Bagging accuracy'].max())
43 0.7325110263871277
In [18]:
# run bagging model in the entire train set
best_bag_clf = BaggingClassifier(n_estimators=59, base_estimator=base_clf)
clf = best_bag_clf.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test,Y_test))
Test Set Accuracy: 0.7181956411556006
In [28]:
# 4.7 AdaBoostClassifier

res_boosting_mean = []
res_boosting_std = []
n_range = range(1,100,1)
for n in n_range:
    clf_boosting = AdaBoostClassifier(n_estimators=n, learning_rate=0.5)
    clf_boosting_scores = cross_val_score(clf_boosting, X, Y, cv=5)
    res_boosting_mean.append(clf_boosting_scores.mean())
    res_boosting_std.append(clf_boosting_scores.std())
df_boosting = pd.DataFrame({'Boosting accuracy':res_boosting_mean,'Boosting error':res_boosting_std},index=n_range)
df_boosting
Out[28]:
Boosting accuracy Boosting error
1 0.705507 0.011043
2 0.705507 0.011043
3 0.705507 0.011043
4 0.705507 0.011043
5 0.715271 0.015084
6 0.712735 0.015279
7 0.714764 0.015588
8 0.716793 0.016472
9 0.718947 0.014641
10 0.723131 0.014620
11 0.725286 0.014544
12 0.726302 0.018030
13 0.727441 0.015887
14 0.726555 0.017896
15 0.726935 0.016728
16 0.728456 0.016254
17 0.726680 0.014817
18 0.727694 0.014758
19 0.726554 0.015898
20 0.728076 0.016556
21 0.728075 0.015710
22 0.727695 0.015446
23 0.728328 0.013742
24 0.727694 0.014041
25 0.728709 0.014306
26 0.729597 0.015463
27 0.729850 0.015942
28 0.729470 0.015229
29 0.729851 0.015572
30 0.730611 0.015428
... ... ...
70 0.730992 0.016407
71 0.732133 0.016564
72 0.732514 0.017077
73 0.732641 0.017595
74 0.732767 0.016195
75 0.733147 0.016999
76 0.731753 0.017338
77 0.730739 0.016760
78 0.731753 0.016916
79 0.732006 0.016530
80 0.732133 0.015871
81 0.732006 0.016434
82 0.732133 0.016780
83 0.732513 0.016441
84 0.733147 0.015991
85 0.733274 0.016180
86 0.732894 0.016171
87 0.733907 0.015547
88 0.734034 0.015777
89 0.734161 0.015584
90 0.734034 0.015823
91 0.733654 0.016016
92 0.734415 0.016461
93 0.734668 0.016363
94 0.734922 0.016488
95 0.734162 0.016817
96 0.734922 0.016280
97 0.735302 0.016207
98 0.735809 0.014774
99 0.735556 0.015878

99 rows × 2 columns

In [34]:
df_boosting["Boosting accuracy"].plot(title = 'Boosting accuracy', x = 'n_estimators', y = 'accuracy')
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1c17b502b0>
In [12]:
print(df_boosting['Boosting accuracy'].argmax(),df_boosting['Boosting accuracy'].max())
98 0.7358085873547624
In [8]:
# # run boost model in the entire train set
best_boost_clf = AdaBoostClassifier(n_estimators=98, learning_rate=0.5)
clf = best_boost_clf.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test,Y_test))
Test Set Accuracy: 0.727318803852002
In [9]:
# 
import pickle
pickle.dump(best_boost_clf, open('best_boost_clf.sav', 'wb'))
pickle.dump(X, open('X_train.sav', 'wb'))
pickle.dump(X_test, open('X_test.sav', 'wb'))
pickle.dump(Y_test, open('Y_test.sav', 'wb'))
In [ ]: