# Project 2 Decision Support System
# MSMA 12B
# Team: Shulian Guan, Ziting Liao, Xiaoran Li, Yiwei Zhang, Zheng Lian
!pwd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Data = pd.read_csv("heloc_dataset_v1.csv")
# 1. explore the whole dataset
Data.head(10)
Data.info()
Data.describe()
Data['RiskPerformance'].value_counts()
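# the majority-class share is the accuracy baseline any model must beat;
# normalize=True (a small addition for context) reports proportions instead of counts
Data['RiskPerformance'].value_counts(normalize=True)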
# 2. data cleaning
# deal with missing values: count the special codes -7, -8, -9 in each column
for col in Data.columns:
    print(sum(Data[col] == -7), sum(Data[col] == -8), sum(Data[col] == -9), col)
# drop rows containing the special code -9 (note: dropna removes any row with
# a -9, not only rows that are all -9)
Data = Data.replace(-9, np.nan)
Data = Data.dropna(axis=0)
Data.shape
# deal with categorical variables
Data = pd.get_dummies(Data, columns=['MaxDelq2PublicRecLast12M', 'MaxDelqEver'], drop_first=False)
# convert the target variable to 0/1 (Bad = 0, Good = 1)
Data['RiskPerformance'] = Data['RiskPerformance'].replace({"Bad": 0, "Good": 1})
Data.head(5)
# list the columns remaining after dummy encoding
for col in Data.columns:
    print(col)
# spot-check a slice of the encoded columns (positions 19-24)
Data.iloc[:,19:25].describe()
# verify which special codes remain after cleaning and encoding
for col in Data.columns:
    print(sum(Data[col] == -7), sum(Data[col] == -8), sum(Data[col] == -9), col)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from IPython.display import Image
from IPython.display import IFrame
import warnings
warnings.filterwarnings("ignore")
# 3. Create train and test sets
train_set, test_set = train_test_split(Data, test_size=0.2, random_state=1)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)
train_set.to_csv('TrainData.csv',index=False)
test_set.to_csv('TestData.csv',index=False)
print(train_set.shape, test_set.shape)
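# the split above is unstratified; a stratified variant (a sketch, not used for
# the results below) would preserve the Good/Bad ratio exactly in both splits
strat_train, strat_test = train_test_split(
    Data, test_size=0.2, random_state=1, stratify=Data['RiskPerformance'])
print(strat_train['RiskPerformance'].mean(), strat_test['RiskPerformance'].mean())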
X = train_set.copy().drop("RiskPerformance", axis=1)
Y = train_set["RiskPerformance"].copy()
X_test = test_set.copy().drop("RiskPerformance", axis=1)
Y_test = test_set["RiskPerformance"].copy()
# 4. Run models
# 4.1 Logistic Regression
log_reg = LogisticRegression()  # default L2-regularized logistic regression
scores = cross_val_score(log_reg, X, Y, cv=10)
print("Train Set Accuracy:", scores.mean())
clf = log_reg.fit(X, Y)
print("Test Set Accuracy:", clf.score(X_test, Y_test))
# 4.2 Decision Tree Classifier
param_grid = [{'max_depth':[3,5,7,9,11],'max_features':[2,4,6,8,10,12]}]
tree_clf = DecisionTreeClassifier(random_state=1)
grid_search = GridSearchCV(tree_clf, param_grid, cv=5)
grid_search.fit(X,Y)
print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_tree_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_tree_clf.score(X_test,Y_test))
# 4.3 Random Forest Classifier
param_grid = [{'n_estimators':[70,80,90,100],'max_features':[2,4,6,8,10]}]
rf_clf = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(rf_clf, param_grid, cv=4)
grid_search.fit(X,Y)
print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_rf_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_rf_clf.score(X_test,Y_test))
# feature importance from an auxiliary shallow forest (max_depth=1 keeps each tree to a single split)
clf_rf = RandomForestClassifier(max_features="sqrt", n_estimators=50, max_depth=1)
clf_rf = clf_rf.fit(X,Y)
# plot feature importance
pd.Series(data=clf_rf.feature_importances_, index=X.columns).sort_values().plot.bar()
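# the bar plot is crowded after dummy encoding; printing the ten largest
# importances (a convenience sketch) makes the ranking explicit
importances = pd.Series(clf_rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))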
# 4.4 KNN
param_grid = [{'n_neighbors':[5,10,20,25,30,40]}]
knn_clf = neighbors.KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5)
grid_search.fit(X,Y)
print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_knn_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_knn_clf.score(X_test,Y_test))
# 4.5 Linear Discriminant Analysis
# n_components is capped at n_classes - 1 (= 1 for this binary target), so the
# grid over [2,4,6,8,10] has no valid candidate; fit LDA directly instead
lda_clf = LinearDiscriminantAnalysis()
scores = cross_val_score(lda_clf, X, Y, cv=5)
print("Train Set Accuracy:", scores.mean())
best_lda_clf = lda_clf.fit(X, Y)
print("Test Set Accuracy:", best_lda_clf.score(X_test, Y_test))
# 4.6 BaggingClassifier based on decision tree classifier
tree_depth = 5
base_clf = tree.DecisionTreeClassifier(max_depth=tree_depth)  # base classifier
results = []
n_range = range(1, 100)
for n in n_range:
    # note: base_estimator was renamed to estimator in scikit-learn 1.2
    clf_bagging = BaggingClassifier(n_estimators=n, base_estimator=base_clf)
    scores = cross_val_score(clf_bagging, X, Y, cv=5)
    results.append((n, scores.mean(), scores.std()))
df_bagging = pd.DataFrame(data=results,columns=['n','Bagging accuracy','Bagging error'])
df_bagging
# look up the n with the highest CV accuracy
best_n = df_bagging.loc[df_bagging['Bagging accuracy'].idxmax(), 'n']
print(best_n, df_bagging['Bagging accuracy'].max())
# refit the best bagging model on the entire training set
best_bag_clf = BaggingClassifier(n_estimators=59, base_estimator=base_clf)
clf = best_bag_clf.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test,Y_test))
# 4.7 AdaBoostClassifier
res_boosting_mean = []
res_boosting_std = []
n_range = range(1, 100)
for n in n_range:
    clf_boosting = AdaBoostClassifier(n_estimators=n, learning_rate=0.5)
    clf_boosting_scores = cross_val_score(clf_boosting, X, Y, cv=5)
    res_boosting_mean.append(clf_boosting_scores.mean())
    res_boosting_std.append(clf_boosting_scores.std())
df_boosting = pd.DataFrame({'Boosting accuracy':res_boosting_mean,'Boosting error':res_boosting_std},index=n_range)
df_boosting
df_boosting["Boosting accuracy"].plot(title = 'Boosting accuracy', x = 'n_estimators', y = 'accuracy')
# idxmax returns the index label (the best n); argmax would return the position
print(df_boosting['Boosting accuracy'].idxmax(), df_boosting['Boosting accuracy'].max())
# refit the best boosting model on the entire training set
best_boost_clf = AdaBoostClassifier(n_estimators=98, learning_rate=0.5)
clf = best_boost_clf.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test,Y_test))
# 5. save the fitted model and data splits for the decision support tool
import pickle
pickle.dump(best_boost_clf, open('best_boost_clf.sav', 'wb'))
pickle.dump(X, open('X_train.sav', 'wb'))
pickle.dump(X_test, open('X_test.sav', 'wb'))
pickle.dump(Y_test, open('Y_test.sav', 'wb'))
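# a minimal sketch of how the decision support tool would reload the artifacts;
# treating predict_proba[:, 1] as P(Good) is an assumption about how the tool
# scores applicants
model = pickle.load(open('best_boost_clf.sav', 'rb'))
X_eval = pickle.load(open('X_test.sav', 'rb'))
risk_scores = model.predict_proba(X_eval)[:, 1]  # higher = more likely Good
print(risk_scores[:5])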