# Project 2 Decision Support System
# MSMA 12B
# Team: Shulian Guan, Ziting Liao, Xiaoran Li, Yiwei Zhang, Zheng Lian
!pwd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Data = pd.read_csv("heloc_dataset_v1.csv")
# 1. explore the whole dataset
Data.head(10)
Data.info()
Data.describe()
Data['RiskPerformance'].value_counts()
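# the majority-class share is the accuracy baseline any model must beat;
# normalize=True (a small addition for context) reports proportions instead of counts
Data['RiskPerformance'].value_counts(normalize=True)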
# 2. data cleaning
# deal with missing values: count the special codes -7, -8, -9 in each column
for col in Data.columns:
    print(sum(Data[col] == -7), sum(Data[col] == -8), sum(Data[col] == -9), col)
# drop rows containing the special code -9 (note: dropna removes any row with
# a -9, not only rows that are all -9)
Data = Data.replace(-9, np.nan)
Data = Data.dropna(axis=0)
Data.shape
# deal with categorical variables
Data = pd.get_dummies(Data, columns=['MaxDelq2PublicRecLast12M', 'MaxDelqEver'], drop_first=False)
# convert the target variable to 0/1 (Bad = 0, Good = 1)
Data['RiskPerformance'] = Data['RiskPerformance'].replace({"Bad": 0, "Good": 1})
Data.head(5)
# list the columns remaining after dummy encoding
for col in Data.columns:
    print(col)
# spot-check a slice of the encoded columns (positions 19-24)
Data.iloc[:,19:25].describe()
# verify which special codes remain after cleaning and encoding
for col in Data.columns:
    print(sum(Data[col] == -7), sum(Data[col] == -8), sum(Data[col] == -9), col)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from IPython.display import Image
from IPython.display import IFrame
import warnings
warnings.filterwarnings("ignore")
# 3. Create train and test sets
train_set, test_set = train_test_split(Data, test_size=0.2, random_state=1)
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)
train_set.to_csv('TrainData.csv',index=False)
test_set.to_csv('TestData.csv',index=False)
print(train_set.shape, test_set.shape)
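# the split above is unstratified; a stratified variant (a sketch, not used for
# the results below) would preserve the Good/Bad ratio exactly in both splits
strat_train, strat_test = train_test_split(
    Data, test_size=0.2, random_state=1, stratify=Data['RiskPerformance'])
print(strat_train['RiskPerformance'].mean(), strat_test['RiskPerformance'].mean())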
X = train_set.copy().drop("RiskPerformance", axis=1)
Y = train_set["RiskPerformance"].copy()
X_test = test_set.copy().drop("RiskPerformance", axis=1)
Y_test = test_set["RiskPerformance"].copy()
# 4. Run models
# 4.1 Logistic Regression
log_reg = LogisticRegression()  # default L2-regularized logistic regression
scores = cross_val_score(log_reg, X, Y, cv=10)
print("Train Set Accuracy:", scores.mean())
clf = log_reg.fit(X, Y)
print("Test Set Accuracy:", clf.score(X_test, Y_test))
# 4.2 Decision Tree Classifier
param_grid = [{'max_depth':[3,5,7,9,11],'max_features':[2,4,6,8,10,12]}]
tree_clf = DecisionTreeClassifier(random_state=1)
grid_search = GridSearchCV(tree_clf, param_grid, cv=5)
grid_search.fit(X,Y)
print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_tree_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_tree_clf.score(X_test,Y_test))
# 4.3 Random Forest Classifier
param_grid = [{'n_estimators':[70,80,90,100],'max_features':[2,4,6,8,10]}]
rf_clf = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(rf_clf, param_grid, cv=4)
grid_search.fit(X,Y)
print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_rf_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_rf_clf.score(X_test,Y_test))
# feature importance from an auxiliary shallow forest (max_depth=1 keeps each tree to a single split)
clf_rf = RandomForestClassifier(max_features="sqrt", n_estimators=50, max_depth=1)
clf_rf = clf_rf.fit(X,Y)
# plot feature importance
pd.Series(data=clf_rf.feature_importances_, index=X.columns).sort_values().plot.bar()
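# the bar plot is crowded after dummy encoding; printing the ten largest
# importances (a convenience sketch) makes the ranking explicit
importances = pd.Series(clf_rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))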
# 4.4 KNN
param_grid = [{'n_neighbors':[5,10,20,25,30,40]}]
knn_clf = neighbors.KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5)
grid_search.fit(X,Y)
print(grid_search.best_params_)
print("Train Set Accuracy:", grid_search.best_score_)
best_knn_clf = grid_search.best_estimator_
print("Test Set Accuracy:",best_knn_clf.score(X_test,Y_test))
# 4.5 Linear Discriminant Analysis
# n_components is capped at n_classes - 1 (= 1 for this binary target), so the
# grid over [2,4,6,8,10] has no valid candidate; fit LDA directly instead
lda_clf = LinearDiscriminantAnalysis()
scores = cross_val_score(lda_clf, X, Y, cv=5)
print("Train Set Accuracy:", scores.mean())
best_lda_clf = lda_clf.fit(X, Y)
print("Test Set Accuracy:", best_lda_clf.score(X_test, Y_test))
# 4.6 BaggingClassifier based on decision tree classifier
tree_depth = 5
base_clf = tree.DecisionTreeClassifier(max_depth=tree_depth)  # base classifier
results = []
n_range = range(1, 100)
for n in n_range:
    # note: base_estimator was renamed to estimator in scikit-learn 1.2
    clf_bagging = BaggingClassifier(n_estimators=n, base_estimator=base_clf)
    scores = cross_val_score(clf_bagging, X, Y, cv=5)
    results.append((n, scores.mean(), scores.std()))
df_bagging = pd.DataFrame(data=results,columns=['n','Bagging accuracy','Bagging error'])
df_bagging
# look up the n with the highest CV accuracy
best_n = df_bagging.loc[df_bagging['Bagging accuracy'].idxmax(), 'n']
print(best_n, df_bagging['Bagging accuracy'].max())
# refit the best bagging model on the entire training set
best_bag_clf = BaggingClassifier(n_estimators=59, base_estimator=base_clf)
clf = best_bag_clf.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test,Y_test))
# 4.7 AdaBoostClassifier
res_boosting_mean = []
res_boosting_std = []
n_range = range(1, 100)
for n in n_range:
    clf_boosting = AdaBoostClassifier(n_estimators=n, learning_rate=0.5)
    clf_boosting_scores = cross_val_score(clf_boosting, X, Y, cv=5)
    res_boosting_mean.append(clf_boosting_scores.mean())
    res_boosting_std.append(clf_boosting_scores.std())
df_boosting = pd.DataFrame({'Boosting accuracy':res_boosting_mean,'Boosting error':res_boosting_std},index=n_range)
df_boosting
df_boosting["Boosting accuracy"].plot(title = 'Boosting accuracy', x = 'n_estimators', y = 'accuracy')
# idxmax returns the index label (the best n); argmax would return the position
print(df_boosting['Boosting accuracy'].idxmax(), df_boosting['Boosting accuracy'].max())
# refit the best boosting model on the entire training set
best_boost_clf = AdaBoostClassifier(n_estimators=98, learning_rate=0.5)
clf = best_boost_clf.fit(X, Y)
print("Test Set Accuracy:",clf.score(X_test,Y_test))
# 5. save the fitted model and data splits for the decision support tool
import pickle
pickle.dump(best_boost_clf, open('best_boost_clf.sav', 'wb'))
pickle.dump(X, open('X_train.sav', 'wb'))
pickle.dump(X_test, open('X_test.sav', 'wb'))
pickle.dump(Y_test, open('Y_test.sav', 'wb'))
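# a minimal sketch of how the decision support tool would reload the artifacts;
# treating predict_proba[:, 1] as P(Good) is an assumption about how the tool
# scores applicants
model = pickle.load(open('best_boost_clf.sav', 'rb'))
X_eval = pickle.load(open('X_test.sav', 'rb'))
risk_scores = model.predict_proba(X_eval)[:, 1]  # higher = more likely Good
print(risk_scores[:5])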