# import packages
import pandas as pd
import numpy as np
# silence warnings so cell outputs stay clean (note: the filter call below is currently commented out)
import warnings
#warnings.filterwarnings('ignore')
# Load the raw loans dataset.  low_memory=False makes pandas read the whole
# file in one pass so mixed-type columns get a consistent dtype inference.
df = pd.read_csv("LoansTrainingSet.csv", low_memory=False)

# Short snake_case column names for typing convenience.
df.columns = [
    'loanid', 'custid', 'loan_status', 'loan_amount', 'term',
    'credit_score', 'years_in_job', 'home_ownership', 'income',
    'purpose', 'monthly_debt', 'years_credhistory', 'months_since_del',
    'nr_accounts', 'nr_problems', 'credit_balance', 'max_credit',
    'bankr', 'tax_liens',
]
# Each loan ID should contribute exactly ONE row used to predict its default
# probability.  The raw file contains ~16k rows that are exact copies of
# another row, so drop those first.  (The original had this note as a bare
# markdown line — a SyntaxError in a .py file — and used Python-2 print
# statements; both fixed here.)
print(df.shape)  # ~256k rows before deduplication
df.drop_duplicates(inplace=True)  # eliminate exactly-identical rows
print(df.shape)  # ~240k rows remain after exact duplicates are dropped
# Encode the target: 'Charged Off' -> 0, 'Fully Paid' -> 1.
# NOTE(review): any other status label would become NaN here — assumed the
# data only contains these two values; confirm.
status_codes = {'Charged Off': 0, 'Fully Paid': 1}
df.loan_status = df.loan_status.map(status_codes)
# Eliminate rows missing values in the core numeric features, then keep one
# row per loan id (duplicates that are not byte-identical).
df.dropna(subset=['credit_score', 'income', 'bankr', 'tax_liens'], inplace=True)
df = df.drop_duplicates(subset='loanid')

# Bucket the raw employment-length labels into coarser bands.
# FIX: the raw data uses the label 'Under 1 year' (see the dummy column
# 'years_in_job_Under 1 year' produced later), which the original map did
# not include — those rows silently became NaN in years_in_job2.  Both
# spellings are mapped now; the old key is kept for backward compatibility.
w = {
    '10+ years': 'Over 10 years',
    '< 1 year': 'Less than 1 Year',
    'Under 1 year': 'Less than 1 Year',
    '1 year': '1-5', '2 years': '1-5', '3 years': '1-5',
    '4 years': '1-5', '5 years': '1-5',
    '6 years': '6-10', '7 years': '6-10', '8 years': '6-10',
    '9 years': '6-10',
    'n/a': 'Not Applicable',
}
df['years_in_job2'] = df.years_in_job.map(w)

# 'HaveMortgage' and 'Home Mortgage' are the same category spelled two ways.
df.loc[df.home_ownership == 'HaveMortgage', 'home_ownership'] = 'Home Mortgage'

# Trim extreme loan amounts.
df = df[df.loan_amount < 200000]
# Credit scores above 800 look like a decimal-shift entry error (e.g. 7400
# instead of 740); divide those by 10 to bring them back into range.
# (The original had these notes as bare markdown lines — a SyntaxError in a
# .py file — converted to comments here.)
df.credit_score = df.credit_score.apply(lambda x: x / 10 if x > 800 else x)

# Drop the handful (~14) of incomes at or above $1,000,000 as outliers.
df = df[df.income < 1000000]
# Keep bankr as object so pd.get_dummies() one-hot encodes it later.
df.bankr = df.bankr.astype(object)

# monthly_debt arrives as text like '$1,234.56'; strip formatting and convert
# to float.  FIX: regex=False makes '$' a literal character — under the
# classic pandas default (regex=True) '$' is the end-of-string anchor, so the
# dollar sign was never actually removed and the float conversion could fail.
df.monthly_debt = df.monthly_debt.str.replace('$', '', regex=False)
df.monthly_debt = df.monthly_debt.str.replace(',', '', regex=False)
df.monthly_debt = df.monthly_debt.astype(float)

# Cap monthly debt at $6,000 to drop extreme outliers.
df = df[df.monthly_debt <= 6000]

# Notebook-style inspection of the cleaned frame.
df.shape
df.info()
# Keep only the modelling columns: drop the identifiers and the
# credit-history features excluded from this iteration.
# TODO: consider adding custid back and engineering a debt-to-income ratio.
excluded = ['loanid', 'custid', 'years_credhistory', 'months_since_del',
            'nr_accounts', 'nr_problems', 'credit_balance', 'max_credit']
Xy = df.copy()
Xy.drop(excluded, axis=1, inplace=True)
Xy.columns

# Split into features X and target y, then one-hot encode the categoricals
# (drop_first=True avoids the redundant reference-level dummy).
X = Xy.copy()
del X['loan_status']
y = Xy['loan_status']
X.columns
X = pd.get_dummies(X, drop_first=True)
X.head()
# FIX: train_test_split lives in sklearn.model_selection — the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split as tts

# Hold out 20% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=33)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE

# Recursive feature elimination down to the 3 strongest features, using a
# gradient-boosting model to rank them.
model = GradientBoostingClassifier()
rfe = RFE(model, n_features_to_select=3)
fitting = rfe.fit(X, y)
# FIX: the fitted selector is bound to 'fitting'; the original printed
# fit.n_features_, and 'fit' was never defined (NameError).
print("Num features: %d" % fitting.n_features_)

# Impurity-based feature importances from a plain GBM fit on all features.
model2 = GradientBoostingClassifier()
model2.fit(X, y)
print(model2.feature_importances_)
# Fit a panel of classifiers — Logistic Regression, Decision Tree,
# Random Forest, Gradient Boosting, AdaBoost — and compare test accuracy.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

log = LogisticRegression()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
gradient = GradientBoostingClassifier()
ada = AdaBoostClassifier()

### Optimize for recall instead of accuracy!!!
l = log.fit(X_train, y_train)
t = tree.fit(X_train, y_train)
f = forest.fit(X_train, y_train)
g = gradient.fit(X_train, y_train)
a = ada.fit(X_train, y_train)

# Test-set accuracy for each fitted model.
for message, fitted in [
    ("The score for Logistic Regression is, ", l),
    ("The score for Decision Trees is ", t),
    ("The score for Random Forest is ", f),
    ("The score for Gradient Boosting is ", g),
    ("The score for AdaBoost is ", a),
]:
    print(message, fitted.score(X_test, y_test))
from sklearn.metrics import recall_score, precision_score, f1_score

# Recall / precision / F1 for the gradient-boosting model on the test set.
# FIX: converted Python-2 print statements to print() calls, consistent with
# the print() usage elsewhere in this file.
pred = g.predict(X_test)
print(recall_score(y_test, pred))
print(precision_score(y_test, pred))
print(f1_score(y_test, pred))
from xgboost import XGBClassifier
from xgboost import plot_importance

# XGBoost: many shallow trees with a small learning rate, row subsampling
# and per-level column subsampling; fixed seed for reproducibility.
# TODO: also try colsample_bytree.
xgb_params = dict(
    max_depth=5,
    learning_rate=.01,
    n_estimators=2000,
    nthread=-1,
    min_child_weight=2,
    subsample=.6,
    colsample_bylevel=.5,
    seed=0,
)
model = XGBClassifier(**xgb_params)
# Shape check: rows and dummy-encoded feature count of the training matrix.
X_train.shape
# Re-assign explicit (unicode) column names to the dummy-encoded training
# matrix.  NOTE(review): this hardcoded list must match the exact number and
# order of the columns produced by pd.get_dummies above — if the input data
# or the dummy encoding changes, this raises a length mismatch or silently
# mislabels features.  Presumably done to give XGBoost plain-string feature
# names; verify against list(X.columns).
X_train.columns = [u'loan_amount', u'credit_score', u'income', u'monthly_debt',
u'tax_liens', u'term_Short Term', u'years_in_job_10+ years',
u'years_in_job_2 years', u'years_in_job_3 years',
u'years_in_job_4 years', u'years_in_job_5 years',
u'years_in_job_6 years', u'years_in_job_7 years',
u'years_in_job_8 years', u'years_in_job_9 years',
u'years_in_job_Under 1 year', u'years_in_job_n/a',
u'home_ownership_Own Home', u'home_ownership_Rent',
u'purpose_Buy House', u'purpose_Buy a Car',
u'purpose_Debt Consolidation', u'purpose_Educational Expenses',
u'purpose_Home Improvements', u'purpose_Medical Bills',
u'purpose_Other', u'purpose_Take a Trip', u'purpose_other',
u'bankr_1.0', u'bankr_2.0', u'bankr_3.0', u'bankr_4.0', u'bankr_5.0',
u'bankr_6.0', u'years_in_job2_6-10', u'years_in_job2_Less than 1 Year',
u'years_in_job2_Not Applicable', u'years_in_job2_Over 10 years']
# Train the XGBoost model defined above on the renamed training matrix.
model.fit(X_train, y_train)
# Apply the same explicit column names to the test matrix so they match the
# names the XGBoost model was trained with.  NOTE(review): this list must be
# byte-identical to the training-side list and match the get_dummies output
# order exactly; keeping two hand-maintained copies is fragile — verify both
# against list(X.columns).
X_test.columns = [u'loan_amount', u'credit_score', u'income', u'monthly_debt',
u'tax_liens', u'term_Short Term', u'years_in_job_10+ years',
u'years_in_job_2 years', u'years_in_job_3 years',
u'years_in_job_4 years', u'years_in_job_5 years',
u'years_in_job_6 years', u'years_in_job_7 years',
u'years_in_job_8 years', u'years_in_job_9 years',
u'years_in_job_Under 1 year', u'years_in_job_n/a',
u'home_ownership_Own Home', u'home_ownership_Rent',
u'purpose_Buy House', u'purpose_Buy a Car',
u'purpose_Debt Consolidation', u'purpose_Educational Expenses',
u'purpose_Home Improvements', u'purpose_Medical Bills',
u'purpose_Other', u'purpose_Take a Trip', u'purpose_other',
u'bankr_1.0', u'bankr_2.0', u'bankr_3.0', u'bankr_4.0', u'bankr_5.0',
u'bankr_6.0', u'years_in_job2_6-10', u'years_in_job2_Less than 1 Year',
u'years_in_job2_Not Applicable', u'years_in_job2_Over 10 years']
# Test-set accuracy of the fitted XGBoost model.
print("The score for XGB is ", model.score(X_test, y_test))
# Rank the gradient-boosting model's features by impurity importance.
# FIX: use a fresh name — the original reused 'f', clobbering the fitted
# RandomForest bound to 'f' earlier in the file.
importances = g.feature_importances_
len(importances)
feat_imp = pd.DataFrame(
    data={'Feature Name': X.columns, 'Feature Importance': importances},
    columns=['Feature Name', 'Feature Importance'],
)
feat_imp.sort_values('Feature Importance', ascending=False, inplace=True)
feat_imp