# Core Packages
import pandas as pd
import numpy as np
import os
import random
import glob
import pickle
import zipfile
# Convert Time Features
from datetime import datetime as dt
# Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import HTML
from PIL import Image
from urllib.request import urlopen
import json
from jupyterthemes import jtplot
jtplot.style(theme='chesterish', grid=False)
# Imputing Data
from sklearn.impute import KNNImputer
# Splitting Data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Data Reduction
from sklearn.decomposition import PCA
# Multicollinearity
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
# Machine Learning Packages
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier, BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SVMSMOTE
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Neural Network
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization
from imblearn.keras import BalancedBatchGenerator
from keras.wrappers.scikit_learn import KerasClassifier
# Save Runtime
import time
# Model Selection and Hyperparameter Tuning
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# Output Statistics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Remove warnings
import warnings
warnings.filterwarnings('ignore')
I set up all data wrangling and analysis as a series of functions, which lets me reuse them on data from subsequent years (future projects) and with various analysis techniques (this project).
Create empty data frames and arrays for function parameters called in this script
# Initiate function parameters
df = pd.DataFrame()
df_acq = pd.DataFrame()
Banks = pd.DataFrame()
X_train = pd.DataFrame()
X_val = pd.DataFrame()
X_test = pd.DataFrame()
banks = ''
bank_str = ''
y_train = np.array([])
y_val = np.array([])
y_test = np.array([])
1) The acquisition data includes one observation for each loan with each feature representing knowledge Fannie Mae has when acquiring the loan (e.g., balance, primary lender, credit score, etc.).
2) The performance data includes one observation for each month a loan is held, with information on the payment of the loan.
I use the acquisition data as predictors of a dichotomous target: whether the homeowner defaulted on the loan. I create this target from the performance data and merge it onto the acquisition data.
The performance data is much larger because it is transaction based, while the acquisition data has the loan as its unit of analysis.
I retain only the most recent performance transaction relating to foreclosure, then drop all other variables except Loan ID (the primary key) and merge performance data onto acquisition data.
I recode performance data into dichotomous categorization of whether loan was foreclosed upon.
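A minimal sketch of that recode and merge, assuming df_acq holds the acquisition records and df_perf the performance records, keyed by 'Loan ID' with a reporting-period date and a foreclosure-date column (these column names are illustrative, not the actual Fannie Mae headers):
# Keep the most recent performance record per loan (illustrative column names)
df_last = df_perf.sort_values('Monthly Reporting Period').groupby('Loan ID', as_index=False).last()
# Dichotomous target: 1 if a foreclosure date was ever recorded, 0 otherwise
df_last['Foreclosed'] = df_last['Foreclosure Date'].notnull().astype(int)
# Keep only the primary key and the target, then merge onto the acquisition data
df_acq = pd.merge(df_acq, df_last[['Loan ID', 'Foreclosed']], on='Loan ID', how='left')
df_acq['Foreclosed'] = df_acq['Foreclosed'].fillna(0).astype(int)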
Federal Reserve Economic Data (FRED) includes macroeconomic data related to the housing market. This data is merged on the date variable (mm/yyyy); it includes monthly and quarterly series, the latter carried forward (each quarterly value represents the beginning of its quarter and is copied to the remaining months) so that every month is covered. Some FRED sets include four Census-region subsets; these were merged on the date variable (mm/yyyy) and the property state variable, which was mapped to the four Census regions of Northeast, Midwest, South, and West. Values were converted to quarterly and yearly deltas (e.g., the change in housing vacancies from 2006 Q4 to 2007 Q1, or from 2006 Q1 to 2007 Q1).
Federal Deposit Insurance Corporation (FDIC) data includes information on FDIC-insured banks, such as their number of employees, assets, debts, etc. I used regular expressions to map the FDIC records to the Bank variable, summing the various instances of the same bank (from different branches or functional areas). This data is merged on the bank variable and the date variable (mm/yyyy); it is quarterly, and values are carried backward (each quarterly value represents the end of its quarter and is copied to the two preceding months) so that every month is covered. Values were converted to quarterly and yearly deltas (e.g., the change in Bank of America liabilities from 2006 Q4 to 2007 Q1, or from 2006 Q1 to 2007 Q1).
Merge FRED on Monthly Data
Carry quarterly values forward to cover each month of the quarter
def merge_fred_on_month(df_mnth, merge_df = df, varname = '', quarter=False, pct_change=1):
# Split date var
df_mnth['Month'] = df_mnth['DATE'].apply(str).apply(lambda x: x.split('/')[0].strip()).apply(str)
df_mnth['Year'] = df_mnth['DATE'].apply(str).apply(lambda x: x.split('/')[1].strip()).apply(str)
df_mnth = df_mnth.drop(labels='DATE', axis=1)
# Retrieve name of main column
var = df_mnth.columns[0]
# Period change
df_mnth[var] = df_mnth[var].pct_change(pct_change)
# Ensure correct dtype
df_mnth[var] = df_mnth[var].astype(float)
# If quarterly data,
# Carry first month of quarter forward
if quarter:
for i in range(df_mnth.shape[0]):
if df_mnth.loc[i, 'Month']=='01':
new_row = df_mnth.iloc[i,:].replace({'Month': '01'}, '02')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '01'}, '03')
df_mnth = df_mnth.append(new_row)
elif df_mnth.loc[i, 'Month']=='04':
new_row = df_mnth.iloc[i,:].replace({'Month': '04'}, '05')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '04'}, '06')
df_mnth = df_mnth.append(new_row)
elif df_mnth.loc[i, 'Month']=='07':
new_row = df_mnth.iloc[i,:].replace({'Month': '07'}, '08')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '07'}, '09')
df_mnth = df_mnth.append(new_row)
elif df_mnth.loc[i, 'Month']=='10':
new_row = df_mnth.iloc[i,:].replace({'Month': '10'}, '11')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '10'}, '12')
df_mnth = df_mnth.append(new_row)
# Create merge var
df_mnth['Original Date'] = (df_mnth['Month'].map(str) + '/' + df_mnth['Year']).apply(str)
df_mnth = df_mnth.drop(labels=['Year', 'Month'], axis=1)
df_mnth = df_mnth.rename(columns={var: varname})
# Merge
merge_df = pd.merge(merge_df, df_mnth, on='Original Date', how='inner')
return merge_df
State to Region Conversion
def to_region(df_new, var, state_drop=False):
# Region-State crosswalk
Northeast = ['ME','VT','NH','MA','NY','RI','CT','PA','NJ']
South = ['DE','MD','DC','WV','VA','KY','NC','TN','SC','GA','FL','AL','MS','AR','LA','OK','TX']
Midwest = ['ND','SD','NE','KS','MN','IA','MO','WI','IL','MI','IN','OH']
West = ['WA','OR','ID','MT','WY','CA','NV','UT','AZ','CO','NM','AK','HI']
# Replace States with Census regions
df_new['Region'] = df_new[var]
df_new['Region'] = df_new['Region'].replace(Northeast, 'Northeast')
df_new['Region'] = df_new['Region'].replace(South, 'South')
df_new['Region'] = df_new['Region'].replace(Midwest, 'Midwest')
df_new['Region'] = df_new['Region'].replace(West, 'West')
# Drop State var
if state_drop:
df_new = df_new.drop(labels=var, axis=1)
return df_new
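A quick usage sketch with a toy frame (state codes chosen for illustration):
toy = pd.DataFrame({'Loan ID': [1, 2, 3], 'Property State': ['CA', 'OH', 'FL']})
toy = to_region(toy, 'Property State')
print(toy['Region'].tolist())  # ['West', 'Midwest', 'South']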
Merge regional data
def region_merge(NE, SO, MW, WE, varname, df_orig = df, quarter=False, pct_change=1):
# Northeast
Northeast = df_orig[df_orig['Region']=='Northeast']
Northeast = Northeast[['Loan ID', 'Region', 'Original Date']]
Northeast = merge_fred_on_month(df_mnth = NE, merge_df = Northeast,
varname=varname, quarter=quarter, pct_change=pct_change)
Northeast = Northeast.rename(columns={Northeast.columns[3]: varname})
# South
South = df_orig[df_orig['Region']=='South']
South = South[['Loan ID', 'Region', 'Original Date']]
South = merge_fred_on_month(df_mnth = SO, merge_df = South,
varname=varname, quarter=quarter, pct_change=pct_change)
South = South.rename(columns={South.columns[3]: varname})
# Midwest
Midwest = df_orig[df_orig['Region']=='Midwest']
Midwest = Midwest[['Loan ID', 'Region', 'Original Date']]
Midwest = merge_fred_on_month(df_mnth = MW, merge_df = Midwest,
varname=varname, quarter=quarter, pct_change=pct_change)
Midwest = Midwest.rename(columns={Midwest.columns[3]: varname})
# West
West = df_orig[df_orig['Region']=='West']
West = West[['Loan ID', 'Region', 'Original Date']]
West = merge_fred_on_month(df_mnth = WE, merge_df = West,
varname=varname, quarter=quarter, pct_change=pct_change)
West = West.rename(columns={West.columns[3]: varname})
# Stack
df_region = pd.concat([Northeast, South, Midwest, West])
df_region = df_region[['Loan ID', varname]]
# Merge
df_new = pd.merge(df_orig, df_region, on='Loan ID', how='inner')
return df_new
FRED Data merge wrapper (full US)
def fred_merge(fred_df, df_orig = df, quarter=True, varname = ''):
# Define units within year
if quarter:
pct_qtr = 1
pct_year = 4
else:
pct_qtr = 4
pct_year = 12
# Define variable name, if not set
if varname=='':
varname = str(fred_df)
# convert datetime
fred_df['DATE'] = pd.to_datetime(fred_df['DATE']).dt.strftime('%m/%Y').apply(str)
# merge FRED data and convert to percent change
df_new = merge_fred_on_month(fred_df, df_orig, varname, quarter=quarter, pct_change=pct_qtr)
df_new = df_new.rename(columns={varname: str(varname + ' (Qtr)')})
df_new = merge_fred_on_month(fred_df, df_new, varname, quarter=quarter, pct_change=pct_year)
df_new = df_new.rename(columns={varname: str(varname + ' (Yr)')})
return df_new
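A hypothetical call, assuming a FRED CSV with the usual DATE column followed by a single value column (the file name here is illustrative):
hfo = pd.read_csv('fred_household_obligations.csv')  # hypothetical FRED extract
df = fred_merge(hfo, df_orig=df, quarter=True, varname='Household Financial Obligations')
# adds 'Household Financial Obligations (Qtr)' and 'Household Financial Obligations (Yr)' percent-change columns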
FRED merge wrapper (region)
def fred_merge_region(NE, SO, MW, WE, df_orig = df, varname = '', quarter=True):
# Define units within year
if quarter:
pct_qtr = 1
pct_year = 4
else:
pct_qtr = 4
pct_year = 12
# Define variable name, if not set
if varname=='':
varname = str(NE)  # no single fred_df in this function; fall back to the Northeast frame for the label
# convert datetime
NE['DATE'] = pd.to_datetime(NE['DATE']).dt.strftime('%m/%Y').apply(str)
SO['DATE'] = pd.to_datetime(SO['DATE']).dt.strftime('%m/%Y').apply(str)
MW['DATE'] = pd.to_datetime(MW['DATE']).dt.strftime('%m/%Y').apply(str)
WE['DATE'] = pd.to_datetime(WE['DATE']).dt.strftime('%m/%Y').apply(str)
# merge FRED data and convert to percent change
df_new = region_merge(NE=NE, SO=SO, MW=MW, WE=WE, df_orig = df_orig,
varname=varname, quarter=quarter, pct_change=pct_qtr)
df_new = df_new.rename(columns={varname: str(varname + ' (Qtr)')})
df_new = region_merge(NE=NE, SO=SO, MW=MW, WE=WE, df_orig = df_new,
varname=varname, quarter=quarter, pct_change=pct_year)
df_new = df_new.rename(columns={varname: str(varname + ' (Yr)')})
return df_new
Standardize bank names of FDIC data to Fannie Mae data
def grep_bank_groupings(df, Bank = 'Bank'):
# Map similar names to bank
## Group Bank of America
BoA = df[Bank].str.contains('Bank of America', case = False)
df.loc[BoA, 'Bank'] = 'Bank of America'
## Group Citi Mortgage
Citi = df[Bank].str.contains('Citibank|Citicorp|CitiMortgage', case = False)
df.loc[Citi, 'Bank'] = 'CitiMortgage'
## Group GMac
GMac = df[Bank].str.contains('GMAC', case = False)
df.loc[GMac, 'Bank'] = 'GMAC Mortgage'
## Group PNC
PNC = df[Bank].str.contains('PNC Bank', case = False)
df.loc[PNC, 'Bank'] = 'PNC Bank'
## Group SunTrust
SunTrust = df[Bank].str.contains('SunTrust', case = False)
df.loc[SunTrust, 'Bank'] = 'SunTrust Mortgage'
## Group AmTrust
AmTrust = df[Bank].str.contains('AmTrust', case = False)
df.loc[AmTrust, 'Bank'] = 'AmTrust Bank'
## Group Flagstar
Flagstar = df[Bank].str.contains('Flagstar', case = False)
df.loc[Flagstar, 'Bank'] = 'Flagstar Bank'
## Group Chase
Chase = df[Bank].str.contains('Chase|JP Morgan|J. P. Morgan|JPMorgan', case = False)
df.loc[Chase, 'Bank'] = 'JPMorgan Chase'
## Group Wells Fargo
Wells = df[Bank].str.contains('Wells Fargo', case = False)
df.loc[Wells, 'Bank'] = 'Wells Fargo Bank'
return df
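A quick illustration with made-up seller names:
toy = pd.DataFrame({'Bank': ['BANK OF AMERICA, N.A.', 'JPMORGAN CHASE BANK, NA', 'CITIMORTGAGE, INC.']})
toy = grep_bank_groupings(toy)
print(toy['Bank'].tolist())  # ['Bank of America', 'JPMorgan Chase', 'CitiMortgage']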
Convert FDIC Data to Monthly Data
Carry quarterly values backward to convert quarterly data to monthly
def fdic_on_month(df_mnth):
# Retrieve names of columns
cols = df_mnth.columns
# convert datetime
df_mnth['repdte'] = pd.to_datetime(df_mnth['repdte']).dt.strftime('%m/%Y').apply(str)
# Split date var
df_mnth['Month'] = df_mnth['repdte'].apply(str).apply(lambda x: x.split('/')[0].strip()).apply(str)
df_mnth['Year'] = df_mnth['repdte'].apply(str).apply(lambda x: x.split('/')[1].strip()).apply(str)
# Carry the quarter-end value backward to the earlier months of the quarter
for i in range(df_mnth.shape[0]):
if df_mnth.loc[i, 'Month']=='03':
new_row = df_mnth.iloc[i,:].replace({'Month': '03'}, '01')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '03'}, '02')
df_mnth = df_mnth.append(new_row)
elif df_mnth.loc[i, 'Month']=='06':
new_row = df_mnth.iloc[i,:].replace({'Month': '06'}, '04')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '06'}, '05')
df_mnth = df_mnth.append(new_row)
elif df_mnth.loc[i, 'Month']=='09':
new_row = df_mnth.iloc[i,:].replace({'Month': '09'}, '07')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '09'}, '08')
df_mnth = df_mnth.append(new_row)
elif df_mnth.loc[i, 'Month']=='12':
new_row = df_mnth.iloc[i,:].replace({'Month': '12'}, '10')
df_mnth = df_mnth.append(new_row)
new_row = df_mnth.iloc[i,:].replace({'Month': '12'}, '11')
df_mnth = df_mnth.append(new_row)
# Create merge var
df_mnth['Original Date'] = (df_mnth['Month'].map(str) + '/' + df_mnth['Year']).apply(str)
df_mnth = df_mnth.drop(labels=['Month', 'Year', 'repdte'], axis=1)
return df_mnth
Remove features with high (10%+) missingness or no data variation, then impute the mean (continuous) or mode (categorical) for features with low missingness
def missing_treat(df):
# Find features with 10% missing or more
condition = ( df.isnull().sum(axis=0)/df.shape[0]*100 )
df_HighMissing = condition > 10
# Save features that contain missing data
df_HighMissing = df_HighMissing.index[df_HighMissing.values == True]
# remove high missing features
df = df.drop(labels=df_HighMissing, axis=1)
# impute on the mean for low missing features that are continuous
df_cont = df.select_dtypes(include=['float64', 'int64'])
df[df_cont.columns] = df_cont.apply(lambda x: x.fillna(x.mean()),axis=0)
# impute on the mode for low missing features that are categorical
df_cat = df.select_dtypes(include=['object'])
df[df_cat.columns] = df_cat.apply(lambda x: x.fillna(x.mode().iloc[0]), axis=0)
return df
Impute using KNN
def KNN_imputations(df_X, df_y, X_cols, n_neighbors=3):
# Subset columns and add target variable
df_imp = df_X[X_cols]
df_imp = pd.concat([df_imp, df_y], axis=1)
# Run imputations
KNN_impute = KNNImputer(n_neighbors=n_neighbors, weights="distance")
df_imp = KNN_impute.fit_transform(df_imp)
df_imp = pd.DataFrame(df_imp, columns=list(X_cols) + ['Target'])
# Drop target variable
df_imp = df_imp.drop(labels='Target', axis=1)
# Drop non-imputed data
df_X = df_X.drop(labels=X_cols, axis=1)
# Merge to full data
df_X = pd.concat([df_X, df_imp], axis=1)
return df_X
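An illustrative call, assuming y_train is a pandas Series aligned with X_train (both on a clean RangeIndex) and that the listed columns are the ones carrying residual missing values (names are placeholders):
X_train = KNN_imputations(X_train, y_train, X_cols=['Debt-to-Income', 'Mortgage Insurance %'], n_neighbors=3)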
Change date features to numeric, if one decides to use time as an ordinal feature
def change_date(df, var_str):
# Convert to ordinal
df[var_str] = df[var_str].apply(lambda x: dt.strptime(x, '%m/%Y').toordinal())
return df
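For example, with a toy frame holding mm/yyyy strings:
toy = pd.DataFrame({'Original Date': ['01/2007', '06/2007']})
print(change_date(toy, 'Original Date')['Original Date'].tolist())  # [732677, 732828]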
One Hot Encoding
Converts categorical variables to dummy variables
def onehotencoding(df):
columns = df.columns[df.isnull().any()]
nan_cols = df[columns]
df = df.drop(nan_cols.columns, axis=1)
df_cat = df.select_dtypes(include=['object'])
onehot = pd.get_dummies(df_cat)
df_cont = df.drop(df_cat.columns, axis=1)
df = pd.concat([df_cont,onehot,nan_cols], axis=1).reset_index(drop=True)
return df
Dimension Reduction on Macroeconomic variables
def pca_fred(X_train=X_train, X_val=X_val, X_test=X_test, n_components=5):
# FRED Subset
columns = ['Household Financial Obligations (Qtr)', 'Household Financial Obligations (Yr)',
'Consumer Debt Service Payment (Qtr)', 'Consumer Debt Service Payment (Yr)',
'National Home Price Index (Qtr)', 'National Home Price Index (Yr)',
'Mortgage Debt Service Payments (Qtr)', 'Mortgage Debt Service Payments (Yr)',
'Monthly Supply of Houses (Qtr)', 'Monthly Supply of Houses (Yr)',
'Vacant Housing Units for Sale (Qtr)', 'Vacant Housing Units for Sale (Yr)',
'Homeownership Rate (Qtr)', 'Homeownership Rate (Yr)', 'Vacant Housing Units for Rent (Qtr)',
'Vacant Housing Units for Rent (Yr)', 'Rental Vacancy Rate (Qtr)', 'Rental Vacancy Rate (Yr)']
fred_train = X_train[columns]
fred_val = X_val[columns]
fred_test = X_test[columns]
# Fit PCA
dimredu = PCA(n_components=n_components, random_state=2020).fit(fred_train)
fred_train = pd.DataFrame(dimredu.transform(fred_train), columns=['Macroeconomy PCA 1',
'Macroeconomy PCA 2',
'Macroeconomy PCA 3',
'Macroeconomy PCA 4',
'Macroeconomy PCA 5'])
fred_val = pd.DataFrame(dimredu.transform(fred_val), columns=['Macroeconomy PCA 1',
'Macroeconomy PCA 2',
'Macroeconomy PCA 3',
'Macroeconomy PCA 4',
'Macroeconomy PCA 5'])
fred_test = pd.DataFrame(dimredu.transform(fred_test), columns=['Macroeconomy PCA 1',
'Macroeconomy PCA 2',
'Macroeconomy PCA 3',
'Macroeconomy PCA 4',
'Macroeconomy PCA 5'])
# Subsitute PCA columns
X_train = X_train.drop(labels=columns, axis=1)
X_train = pd.concat([X_train, fred_train], axis=1)
X_val = X_val.drop(labels=columns, axis=1)
X_val = pd.concat([X_val, fred_val], axis=1)
X_test = X_test.drop(labels=columns, axis=1)
X_test = pd.concat([X_test, fred_test], axis=1)
return X_train, X_val, X_test
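A hypothetical call. The PCA is fit on the training split only and then applied to the validation and test splits; note that the hard-coded 'Macroeconomy PCA 1' through 'Macroeconomy PCA 5' labels assume n_components stays at 5.
X_train, X_val, X_test = pca_fred(X_train=X_train, X_val=X_val, X_test=X_test, n_components=5)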
Feature Selection
Run permutation importance, scored with F1
def relative_importance(X_train, y_train, bank_str, method='bal', max_features='sqrt'):
# Transform X
## define datasets
y = y_train
X = X_train
## Standardize Vars
X_cols = X.columns
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# Permutation importance for feature evaluation
if method=='bal':
clf = BalancedRandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
replacement=False, n_jobs=-1)
elif method=='wgt':
clf = RandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
class_weight={1: 0.08, 0: 0.02}, n_jobs=-1)
elif method=='rus':
clf = RUSBoostClassifier(n_estimators=10, random_state=2020)
clf = clf.fit(X, y)
result = permutation_importance(clf, X, y, n_repeats=5, scoring='f1',
random_state=2020)
# Save results
importances = pd.Series(result.importances_mean, index=X_cols)
# Graph
sorted_idx = importances.argsort()
y_ticks = np.arange(0, 15)
fig, ax = plt.subplots()
if importances.size > 15:
ax.barh(y_ticks, importances[sorted_idx].iloc[-15:])
ax.set_yticklabels(importances[sorted_idx].index[-15:])
elif importances.size > 10:
ax.barh(y_ticks, importances[sorted_idx].iloc[-10:])
ax.set_yticklabels(importances[sorted_idx].index[-10:])
elif importances.size > 7:
ax.barh(y_ticks, importances[sorted_idx].iloc[-7:])
ax.set_yticklabels(importances[sorted_idx].index[-7:])
ax.set_yticks(y_ticks)
ax.set_title(str('Feature Importances for\n' + bank_str))
fig.tight_layout()
print(plt.show())
return importances
Feature Selection
Recursive feature elimination
def RFE_importance(X_train, y_train, bank_str, method='bal', max_features='sqrt'):
# Transform X
## define datasets
y = y_train
X = X_train
## Standardize Vars
X_cols = X.columns
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# Permutation importance for feature evaluation
if method=='bal':
clf = BalancedRandomForestClassifier(n_estimators=200, random_state=2020, max_features=max_features,
replacement=False, n_jobs=-1)
elif method=='wgt':
clf = RandomForestClassifier(n_estimators=200, random_state=2020, max_features=max_features,
class_weight={1: 0.08, 0: 0.02}, n_jobs=-1)
elif method=='rus':
clf = RUSBoostClassifier(n_estimators=200, learning_rate=0.1, random_state=2020)
# Cross-Validate RFE
result = RFECV(clf, min_features_to_select=7, step=1, cv=StratifiedKFold(2),
scoring='f1', n_jobs=-1)
# Fit Model
result.fit(X, y)
# Preview Results
print("Optimal number of features : %d" % result.n_features_)
print("Best F1 Score: {:.2f}".format(max(result.grid_scores_)))
# Save results
importances = pd.Series(result.ranking_, index=X_cols)
sorted_idx = importances.argsort()
df = pd.DataFrame(importances[sorted_idx], columns=['Ranking'])
subset = result.support_
# Graph
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("F1 Score")
plt.plot(range(1, len(result.grid_scores_) + 1), result.grid_scores_)
print(plt.show())
return df, subset
Vote by Committee
def vote_by_committee(X_train, y_train, bank_str, method='bal', max_features='sqrt'):
# Transform X
## define datasets
y = y_train
X = X_train
## Standardize Vars
X_cols = X.columns
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# Permutation importance for feature evaluation
if method=='bal':
clf = BalancedRandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
replacement=False, n_jobs=-1)
elif method=='wgt':
clf = RandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
class_weight={1: 0.08, 0: 0.02}, n_jobs=-1)
elif method=='rus':
clf = RUSBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=2020)
# Cross-Validate RFE
result = RFECV(clf, min_features_to_select=7, step=1, cv=StratifiedKFold(2),
scoring='f1', n_jobs=-1)
# Fit Model
result.fit(X, y)
# Preview Results
print("Optimal number of features : %d" % result.n_features_)
print("Best F1 Score: {:.2f}".format(max(result.grid_scores_)))
# Save results
importances = pd.Series(result.ranking_, index=X_cols)
sorted_idx = importances.argsort()
df = pd.DataFrame(importances[sorted_idx], columns=['Ranking'])
subset = result.support_
# Graph
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("F1 Score")
plt.plot(range(1, len(result.grid_scores_) + 1), result.grid_scores_)
print(plt.show())
return df, subset
Foreclosure Descriptive Statistics
def Overall_Data(YrQtr = "", subset = "", df = df, fmaevars = False, allvars = False, quarter = False):
# Subset by Date
if (YrQtr == ""):
df_sub = pd.DataFrame(df)
else:
yr = int(str(YrQtr)[:4])
df_sub = pd.DataFrame(df)
df_sub = df_sub.loc[df_sub['File Year']==yr,:]
if quarter:
qtr = str(YrQtr)[4:6]
df_sub = df_sub.loc[df_sub['File Quarter']==qtr,:]
# Subset by other variable
if (subset != ""):
df_sub = df_sub.loc[eval(subset),:]
# Foreclosures represented
Foreclosed = ['Did not Foreclose', 'Foreclosed']
Target = df_sub.groupby(['Foreclosed']).size().reset_index(name='Total')
if fmaevars or allvars:
# Original Mortgage Amount
ORM = df_sub.groupby(['Foreclosed']).agg({'Original Mortgage Amount': 'mean'}).round(2)
# Credit Score
CS = df_sub.groupby(['Foreclosed']).agg({'Credit Score': 'mean'}).astype(int)
# Debt-to-Income
DTI = df_sub.groupby(['Foreclosed']).agg({'Debt-to-Income': 'mean'}).round(1)
# First Time Home Buyer
FT = df_sub[df_sub['First Time Home Buyer']=='Y'].groupby(['Foreclosed']).size().reset_index(name='Total')
# Refinance
LP = df_sub[df_sub['Loan Purpose']!=0].groupby(['Foreclosed']).size().reset_index(name='Total')
# Original Interest Rate
IR = df_sub.groupby(['Foreclosed']).agg({'Original Interest Rate': 'mean'}).round(2)
# Original Loan Term
LT = df_sub.groupby(['Foreclosed']).agg({'Original Loan Term': 'mean'}).astype(int)
# Loan-to-Value (LTV)
CLTV = df_sub.groupby(['Foreclosed']).agg({'Loan-to-Value (LTV)': 'mean'}).round(1)
# Single Borrower Ratio
SBR = df_sub.groupby(['Foreclosed']).agg({'Single Borrower': 'mean'}).round(2)
# Mortgage Insurance %
MIP = df_sub.groupby(['Foreclosed']).agg({'Mortgage Insurance %': 'mean'}).round(2)
if allvars:
# Median Household Income
MHI = df_sub.groupby(['Foreclosed']).agg({'Median Household Income': 'mean'}).round(2)
# Loan Change (1 Yr)
LC1 = df_sub.groupby(['Foreclosed']).agg({'Loan Change (1 Yr)': 'mean'}).round(2)
# Loan Change (5 Yr)
LC5 = df_sub.groupby(['Foreclosed']).agg({'Loan Change (5 Yr)': 'mean'}).round(2)
# Loan Liabilities (1 Yr)
LNL1 = df_sub.groupby(['Foreclosed']).agg({'Loan Liabilities (1 Yr)': 'mean'}).round(2)
# Loan Liabilities (5 Yr)
LNL5 = df_sub.groupby(['Foreclosed']).agg({'Loan Liabilities (5 Yr)': 'mean'}).round(2)
# Create Dataset
df_new = pd.DataFrame({ 'Foreclosed': Foreclosed,
'Foreclosed (%)': ((Target['Total'] / df_sub.shape[0]) * 100).round(1),
'Foreclosed (N)': df_sub.groupby(['Foreclosed']).size()
})
if fmaevars or allvars:
df_all = pd.DataFrame({ 'Original Mortgage Amount': ORM['Original Mortgage Amount'].tolist(),
'Credit Score': CS['Credit Score'].tolist(),
'Debt-to-Income': DTI['Debt-to-Income'].tolist(),
'First Time Home Buyer (%)': ((FT['Total'] / Target['Total']) * 100).round(1).tolist(),
'Refinanced': ((LP['Total'] / Target['Total']) * 100).round(1).tolist(),
'Interest Rate': IR['Original Interest Rate'].tolist(),
'Loan Term': LT['Original Loan Term'].tolist(),
'Loan-to-Value (LTV)': CLTV['Loan-to-Value (LTV)'].tolist(),
'Single Borrower Ratio': SBR['Single Borrower'].tolist(),
'Mortgage Insurance %': MIP['Mortgage Insurance %'].tolist(),
})
df_new = pd.concat([df_new, df_all], axis=1)
if allvars:
df_all = pd.DataFrame({
'Median Household Income': MHI['Median Household Income'].tolist(),
'Loan Change (1 Yr)': LC1['Loan Change (1 Yr)'].tolist(),
'Loan Change (5 Yr)': LC5['Loan Change (5 Yr)'].tolist(),
'Loan Liabilities (1 Yr)': LNL1['Loan Liabilities (1 Yr)'].tolist(),
'Loan Liabilities (5 Yr)': LNL5['Loan Liabilities (5 Yr)'].tolist()
})
df_new = pd.concat([df_new, df_all], axis=1)
df_new = df_new.set_index('Foreclosed')
return df_new
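Illustrative calls (the second assumes 'File Quarter' stores values like 'Q1'):
Overall_Data(df=df)  # foreclosure shares only
Overall_Data(YrQtr='2007Q1', df=df, fmaevars=True, quarter=True)  # plus the Fannie Mae loan features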
Bank Descriptive Statistics
def Bank_Data(YrQtr = "", subset = "", df = df, fmaevars = False, allvars = False,
quarter = False, rounding = 1):
# Subset by Date
if (YrQtr == ""):
df_sub = pd.DataFrame(df)
else:
yr = int(str(YrQtr)[:4])
df_sub = pd.DataFrame(df)
df_sub = df_sub.loc[df_sub['File Year']==yr,:]
if quarter:
qtr = str(YrQtr)[4:6]
df_sub = df_sub.loc[df_sub['File Quarter']==qtr,:]
# Subset by other variable
if (subset != ""):
df_sub = df_sub.loc[eval(subset),:]
# Banks represented
Banks = df_sub.groupby(['Bank']).size().reset_index(name='Total')
# Foreclosures
Target = df_sub.groupby(['Bank']).agg({'Foreclosed': 'mean'})
# Fannie Mae variables
if fmaevars or allvars:
# Original Mortgage Amount
ORM = df_sub.groupby(['Bank']).agg({'Original Mortgage Amount': 'mean'}).round(2)
# Credit Score
CS = df_sub.groupby(['Bank']).agg({'Credit Score': 'mean'}).astype(int)
# Debt-to-Income
DTI = df_sub.groupby(['Bank']).agg({'Debt-to-Income': 'mean'}).round(1)
# First Time Home Buyer
FT = df_sub[df_sub['First Time Home Buyer']=='Y'].groupby(['Bank']).size().reset_index(name='Total')
# Refinance
LP = df_sub[df_sub['Loan Purpose']!=0].groupby(['Bank']).size().reset_index(name='Total')
# Original Interest Rate
IR = df_sub.groupby(['Bank']).agg({'Original Interest Rate': 'mean'}).round(2)
# Original Loan Term
LT = df_sub.groupby(['Bank']).agg({'Original Loan Term': 'mean'}).astype(int)
# Loan-to-Value (LTV)
CLTV = df_sub.groupby(['Bank']).agg({'Loan-to-Value (LTV)': 'mean'}).round(1)
# Single Borrower Ratio
SBR = df_sub.groupby(['Bank']).agg({'Single Borrower': 'mean'}).round(2)
# Mortgage Insurance %
MIP = df_sub.groupby(['Bank']).agg({'Mortgage Insurance %': 'mean'}).round(2)
# All other variables
if allvars:
# Median Household Income
MHI = df_sub.groupby(['Bank']).agg({'Median Household Income': 'mean'}).round(2)
# Loan Change (1 Yr)
LC1 = df_sub.groupby(['Bank']).agg({'Loan Change (1 Yr)': 'mean'}).round(2)
# Loan Change (5 Yr)
LC5 = df_sub.groupby(['Bank']).agg({'Loan Change (5 Yr)': 'mean'}).round(2)
# Loan Liabilities (1 Yr)
LNL1 = df_sub.groupby(['Bank']).agg({'Loan Liabilities (1 Yr)': 'mean'}).round(2)
# Loan Liabilities (5 Yr)
LNL5 = df_sub.groupby(['Bank']).agg({'Loan Liabilities (5 Yr)': 'mean'}).round(2)
# Create Dataset
df_new = pd.DataFrame({ 'Bank': Banks['Bank'],
'Bank (%)': ((Banks['Total'] / df_sub.shape[0]) * 100).round(1),
'Bank (N)': Banks['Total'],
'Foreclosed (%)': ((Target['Foreclosed'] * 100).round(rounding)).tolist(),
})
if fmaevars or allvars:
df_all = pd.DataFrame({ 'Original Mortgage Amount': ORM['Original Mortgage Amount'].tolist(),
'Credit Score': CS['Credit Score'].tolist(),
'Debt-to-Income': DTI['Debt-to-Income'].tolist(),
'First Time Home Buyer (%)': ((FT['Total'] / Banks['Total']) * 100).round(1).tolist(),
'Refinanced': ((LP['Total'] / Banks['Total']) * 100).round(1).tolist(),
'Interest Rate': IR['Original Interest Rate'].tolist(),
'Loan Term': LT['Original Loan Term'].tolist(),
'Loan-to-Value (LTV)': CLTV['Loan-to-Value (LTV)'].tolist(),
'Single Borrower Ratio': SBR['Single Borrower'].tolist(),
'Mortgage Insurance %': MIP['Mortgage Insurance %'].tolist(),
})
df_new = pd.concat([df_new, df_all], axis=1)
if allvars:
df_all = pd.DataFrame({
'Median Household Income': MHI['Median Household Income'].tolist(),
'Loan Change (1 Yr)': LC1['Loan Change (1 Yr)'].tolist(),
'Loan Change (5 Yr)': LC5['Loan Change (5 Yr)'].tolist(),
'Loan Liabilities (1 Yr)': LNL1['Loan Liabilities (1 Yr)'].tolist(),
'Loan Liabilities (5 Yr)': LNL5['Loan Liabilities (5 Yr)'].tolist()
})
df_new = pd.concat([df_new, df_all], axis=1)
df_new = df_new.set_index("Bank")
return df_new
Isolate banks based on maximum, minimum, or other meaningful values
def search_Banks(col, df = Banks, func = max, subset = True):
# print(col, func.__name__, "value")
if (subset): cols = col
else: cols = df.columns
values = pd.DataFrame(df[cols][df[col] == func(df[col])])
return values
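Illustrative queries against the Banks summary table built by Bank_Data (the column name is assumed from that output):
search_Banks('Foreclosed (%)', df=Banks, func=max)  # bank with the highest foreclosure rate
search_Banks('Foreclosed (%)', df=Banks, func=max, subset=False)  # the full row for that bank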
Density plot of a feature
def feature_density(var, hist=True, bins=None, l_xlim=None, r_xlim=None):
fig, ax = plt.subplots(1, 1, figsize=(10,5))
sns.distplot(df.loc[df['Foreclosed']==1, var], color='#51c0ef', hist=hist, bins=bins)
sns.distplot(df.loc[df['Foreclosed']==0, var], color='#61ba86', hist=hist, bins=bins)
ax.legend(labels=['Foreclosed', 'Did not Foreclose'], loc='upper right')
ax.set_xlim(left=l_xlim, right=r_xlim)
plt.show()
print(Overall_Data(df = df, allvars = True)[[var]])
Rank features of each bank via bar chart
def bank_rank_gph(var, df, df_bank, b_ylim=None, t_ylim=None):
v = df[var].mean().astype(int)
avg = pd.DataFrame(data={var: v}, index=['All Banks'])
tbl = pd.concat([avg, df_bank.sort_values(by=[var])], axis=0)
display(tbl)
fig, ax = plt.subplots(1, 1, figsize=[12,14])
plt.bar(tbl.index, tbl.loc[:,var], color = '#457b9d')
plt.bar('All Banks', tbl.loc['All Banks',var], color = '#ca2c92')
plt.xticks(rotation=90)
ax.tick_params(axis='both', labelsize=16)
ax.set_ylim(bottom=b_ylim, top=t_ylim)
ax.set_ylabel(var, fontsize=20)
ax.set_xlabel('Bank', fontsize=20)
plt.title(var + '\n2006 - 2008', fontsize=20)
plt.show()
Bar graphs of the best and worst banks for a given feature
def best_worst_gph(var, df, func=[max, min], l_xlim=None, r_xlim=None):
best = search_Banks(var, df = df, func = func[0])
worst = search_Banks(var, df = df, func = func[1])
v = df[var].mean().astype(int)
avg = pd.DataFrame(data={var: v}, index=['Overall'])
label = pd.Series(['Worst Bank', 'Best Bank', 'Average'])
tbl = pd.concat([worst, best, avg], axis=0).reset_index()
tbl.columns = ['Bank', var]
tbl.index = label
display(tbl)
fig, ax = plt.subplots(1, 1, figsize=[9,3])
plt.barh(tbl.index, tbl.loc[:,var], color = '#457b9d')
plt.barh('Average', tbl.loc['Average',var], color = '#ca2c92')
plt.xticks(rotation=90)
ax.set_xlim(left=l_xlim, right=r_xlim)
plt.title(var + '\n2006 - 2008')
plt.show()
Density plots of best and worst banks for a given feature
def best_worst_density(var, df, df_bank, func=[max, min],
hist = True, bins = None, l_xlim = None, r_xlim = None):
best = search_Banks(var, df = df_bank, func = func[0]).reset_index()
worst = search_Banks(var, df = df_bank, func = func[1]).reset_index()
title = ['Best Actor', 'Worst Actor']
i = 0
for v in [best.loc[0,'Bank'], worst.loc[0,'Bank']]:
fig, ax = plt.subplots(1, 1, figsize=(10,5))
sns.distplot(df.loc[(df['Foreclosed']==1) & (df['Bank']==v), var], color='#51c0ef', hist=hist, bins=bins)
sns.distplot(df.loc[(df['Foreclosed']==0) & (df['Bank']==v), var], color='#61ba86', hist=hist, bins=bins)
plt.title(title[i] + '\n' + v)
ax.set_xlim(left=l_xlim, right=r_xlim)
ax.legend(labels=['Foreclosed', 'Did not Foreclose'], loc='upper left')
plt.show()
i += 1
Create single-bank only subsets
# List of banks for reference
banks = ['Bank of America','Wells Fargo Bank','CitiMortgage',
'JPMorgan Chase','GMAC Mortgage','SunTrust Mortgage',
'AmTrust Bank','PNC Bank','Flagstar Bank']
# Function to subset banking datasets
def Bank_Subsets(bank_strs, df_X = X_train, df_y = y_train):
# Initiate Bank dictionaries
X = {}
y = {}
# Bank Subset
for bank_str in bank_strs:
X[bank_str] = df_X.loc[df_X[str('Bank_' + bank_str)]==1, :] \
.filter(regex=r'^(?!Bank_).*$')
y[bank_str] = df_y[np.array(df_X[str('Bank_' + bank_str)]==1)]
return X, y
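A hypothetical use, assuming the one-hot encoded bank columns (e.g., 'Bank_Wells Fargo Bank') are present after onehotencoding:
Banks_X, Banks_y = Bank_Subsets(banks, df_X=X_train, df_y=y_train)
Banks_X_val, Banks_y_val = Bank_Subsets(banks, df_X=X_val, df_y=y_val)
Banks_X_test, Banks_y_test = Bank_Subsets(banks, df_X=X_test, df_y=y_test)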
Update single-bank only subsets with predicted probability assumptions
# Create Data Frames with Updated Assumptions
def pp_dfs(varlist, values, banks,
Banks_X, Banks_X_val, Banks_X_test,
Banks_y, Banks_y_val, Banks_y_test):
# Initiate Dictionaries
X = {}
y = {}
for bank_str in banks:
# Bank-specific concatenation of train, val, and test data
X[bank_str] = pd.concat([Banks_X[bank_str], Banks_X_val[bank_str], Banks_X_test[bank_str]], axis=0)
y[bank_str] = pd.concat([Banks_y[bank_str], Banks_y_val[bank_str], Banks_y_test[bank_str]], axis=0)
# Bank-specific update prediction assumption
for i in range(len(varlist)):
X[bank_str][varlist[i]] = values[i]
# All bank concatenation of train, val, and test data
X['All Banks'] = pd.concat([X_train, X_val, X_test], axis=0)
y['All Banks'] = pd.concat([y_train, y_val, y_test], axis=0)
# All bank update prediction assumption
for i in range(len(varlist)):
X['All Banks'][varlist[i]] = values[i]
return X, y
Set Up Four Types of Models
For the bottom layer
# Standarize Vars
sclr = StandardScaler()
## RFC
## 5:1 balance
## Subset columns to 20%
rfc1_m = BalancedRandomForestClassifier(random_state=2020,
max_features='sqrt', criterion='entropy',
sampling_strategy={0:1500, 1:300},
replacement=False, n_jobs=-1)
rfc1 = Pipeline(steps=[('sclr', sclr), ('rfc1', rfc1_m)])
## RFC
## PCA reduction to 10 columns
## Fully balanced
rfc2_m = BalancedRandomForestClassifier(random_state=2022,
max_features=None, criterion='entropy',
sampling_strategy='auto',
replacement=True, n_jobs=-1)
rfc2 = Pipeline(steps=[('sclr', sclr), ('pca', PCA()), ('rfc2', rfc2_m)])
## RUS Boost
## 3:1 balance
rus_m = RUSBoostClassifier(n_estimators=500, random_state=2023,
sampling_strategy={0:900, 1:300},
replacement=False)
rus = Pipeline(steps=[('sclr', sclr), ('rus', rus_m)])
## Keras
## 5 Layer model
## Fully balanced
def make_model(n_features):
model = Sequential()
model.add(Dense(500, input_shape=(n_features,),
kernel_initializer='glorot_normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Dense(250, kernel_initializer='glorot_normal', use_bias=False))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.1))
#model.add(Dense(500, kernel_initializer='glorot_normal', use_bias=False))
#model.add(BatchNormalization())
#model.add(Activation('relu'))
#model.add(Dropout(0.15))
model.add(Dense(100, kernel_initializer='glorot_normal', use_bias=False))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.05))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
return model
def fit_predict_balanced_model(X, y):
model = make_model(X.shape[1])
training_generator = BalancedBatchGenerator(X, y,
batch_size=2000,
random_state=2024)
model.fit_generator(generator=training_generator, epochs=1000, verbose=0)
return model
keras = KerasClassifier(build_fn=fit_predict_balanced_model, verbose=0)
keras = Pipeline(steps=[('sclr', sclr), ('pca', PCA(n_components=10)), ('keras', keras)])
## Create an environment variable to avoid using the GPU
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
Function for bottom layer models
Runs the individual bank models and the all-banks model
# Function for bottom layer models
def bottom_layer_func(bank_str, estimator, clf_str,
Banks_y, Banks_X,
Banks_y_val, Banks_X_val,
Banks_y_test, Banks_X_test):
print('==============================')
print('==============================')
print(bank_str, 'Model')
print(' - ', estimator)
# Load Bank Data
y_train = Banks_y[bank_str]
X_train = Banks_X[bank_str]
y_val = Banks_y_val[bank_str]
X_val = Banks_X_val[bank_str]
y_test = Banks_y_test[bank_str]
X_test = Banks_X_test[bank_str]
# Param tuning
rfc1_param_grid = {
'rfc1__n_estimators': [50, 75],
'rfc1__min_samples_split': [2, 4, 6]
}
rfc2_param_grid = {
'pca__n_components': [10, 15],
'rfc2__n_estimators': [500, 750],
'rfc2__min_samples_split': [10, 13, 16]
}
rus_param_grid = {
'rus__learning_rate': [0.05, 0.1, 0.15, 0.2]
}
# Set for classifier
if clf_str == 'RFC':
param_grid=rfc1_param_grid
elif clf_str == 'RFC PCA':
param_grid=rfc2_param_grid
elif clf_str == 'RUS Boost':
param_grid=rus_param_grid
else:
param_grid={}
# Model
if clf_str == 'Keras NN':
model = fit_predict_balanced_model(X_train, y_train)
# Determine Thresholds
proba_val = pd.DataFrame(model.predict_proba(X_val, batch_size=2000)).iloc[:,0]
print('Best Threshold')
thres = threshold(y_val, proba_val)
# Classification
proba_test = pd.DataFrame(model.predict_proba(X_test, batch_size=2000)).iloc[:,0]
pred = proba_test.map(lambda x: 1 if x >= thres['Threshold'] else 0)
else:
model = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
CV = pickle.dumps(model)
model = pickle.loads(CV)
model.fit(X_train, y_train)
print('Best Params')
print(model.best_params_)
# Determine Thresholds
proba_val = pd.DataFrame(model.predict_proba(X_val)).loc[:,1]
print('Best Threshold')
thres = threshold(y_val, proba_val)
# Classification
proba_test = pd.DataFrame(model.predict_proba(X_test)).loc[:,1]
pred = proba_test.map(lambda x: 1 if x >= thres['Threshold'] else 0)
# Prediction on test
print(target_values(pred, prediction=True))
print('F1 Score', f1_score(y_test, pred).round(2))
print('')
print('Confusion Matrix')
print(confusion_matrix(y_test, pred))
print('')
return model, thres['Threshold'], proba_test, pred
Classifier Function
Wrapper for running and saving the bottom-layer models
# Classifier Function
def clf_pred_func(bnk_list, clfs, clfs_str,
Banks_y, Banks_X,
Banks_y_val, Banks_X_val,
Banks_y_test, Banks_X_test):
## Model Dictionaries
vote_models = {}
vote_thresholds = {}
vote_proba = {}
vote_pred = {}
## Model for Classifier Predictions
for bank_str in bnk_list:
vote_models[bank_str] = {}
vote_thresholds[bank_str] = {}
vote_proba[bank_str] = {}
vote_pred[bank_str] = {}
for i in range(len(clfs)):
vote_models[bank_str][clfs_str[i]], \
vote_thresholds[bank_str][clfs_str[i]], \
vote_proba[bank_str][clfs_str[i]], \
vote_pred[bank_str][clfs_str[i]] = bottom_layer_func(bank_str = bank_str, \
estimator = clfs[i], \
clf_str = clfs_str[i], \
Banks_y = Banks_y, Banks_X = Banks_X, \
Banks_y_val = Banks_y_val, Banks_X_val = Banks_X_val, \
Banks_y_test = Banks_y_test, Banks_X_test = Banks_X_test)
return vote_models, vote_thresholds, vote_proba, vote_pred
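A hypothetical run wiring the four bottom-layer estimators defined above into the wrapper; the label strings must match the branches in bottom_layer_func:
clfs = [rfc1, rfc2, rus, keras]
clfs_str = ['RFC', 'RFC PCA', 'RUS Boost', 'Keras NN']
vote_models, vote_thresholds, vote_proba, vote_pred = clf_pred_func(
    banks, clfs, clfs_str,
    Banks_y, Banks_X, Banks_y_val, Banks_X_val, Banks_y_test, Banks_X_test)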
Vote classifier function
Saves the bottom-layer predictions into dataframes for voting in the middle and top layers
# Vote classifier
## Voting dataframes function
def votes_clf_func(vote_pred, bnk_list, clfs_str, X):
# Create dictionary of dataframes for voting
votes = {}
# Save all classifier predictions into a dataframe for a given bank
for bank_str in bnk_list:
votes[bank_str] = pd.DataFrame()
# Subset all bank data by bank
for clf in clfs_str:
votes[bank_str].loc[:,clf] = vote_pred[bank_str][clf]
return votes
Prediction function; includes bottom, middle, and top layer.
# Predicted Probabilities
def proba_func(X, banks_str, bank_idx, clfs_str,
vote_models, vote_thresholds):
# Initiate Dictionary
votes = {}
# Bottom Layer
## Bank-specific
for bank_str in banks_str:
# Initiate Data Frames
votes[bank_str] = pd.DataFrame()
## Classifier-specific
for clf in clfs_str:
### Predicted Probility
proba = pd.DataFrame(vote_models[bank_str][clf].predict_proba(X[bank_str])).iloc[:,1]
### Classification
votes[bank_str][clf] = proba.map(lambda x: 1 if x >= vote_thresholds[bank_str][clf] else 0)
# Middle Layer
all_bnks_pred = ( votes['All Banks'].iloc[:,:len(clfs_str)].sum(axis=1) /
len(clfs_str) ) \
.map(lambda x: 1 if x == 1 else 0)
for bank_str in banks_str:
votes[bank_str].loc[:,'All Banks'] = all_bnks_pred.loc[bank_idx == bank_str].reset_index().iloc[:,1]
# Top Layer
for bank_str in banks_str:
votes[bank_str].loc[:,'Majority'] = ( votes[bank_str].iloc[:,:(len(clfs_str)+1)].sum(axis=1) /
( len(clfs_str)+1 ) ) \
.map(lambda x: 1 if x > 0.67 else 0)
return votes
Changes model assumptions by selecting one or more variables and setting them to new values while keeping all other variable values the same, then predicting foreclosures.
def changing_assumptions(varlist, percentile, banks, bank_idx, X,
vote_models, vote_thresholds,
Banks_X, Banks_X_val, Banks_X_test,
Banks_y, Banks_y_val, Banks_y_test):
# Ensure cols is list
if type(varlist) is not list:
varlist = [varlist]
if type(percentile) is not list:
percentile = [percentile]
# Define value of var using total data's percentile
values = []
for i in range(len(varlist)):
values.append(np.percentile(X[varlist[i]], percentile[i]))
print('Converting', varlist[i], 'to the', percentile[i], 'percentile:', values[i].round(0))
print('')
# Credit Score Data
X_df, y_act = pp_dfs(varlist, values, banks,
Banks_X, Banks_X_val, Banks_X_test,
Banks_y, Banks_y_val, Banks_y_test)
# Saved Predicted Probabilities
bank_plus = banks + ['All Banks']
X_pp = proba_func(X_df, bank_plus, bank_idx, clfs_str,
vote_models, vote_thresholds)
# Initiate Data Frames and Dictionaries
final_frcls = pd.DataFrame(columns=['Original Foreclosures', 'Predicted Foreclosures'])
combined_votes = pd.Series()
combined_orig = pd.Series()
for bank_str in banks:
pred = X_pp[bank_str]['Majority']
final_frcls.loc[bank_str, 'Original Foreclosures'] = (np.mean(y_act[bank_str]) * 100).round(1)
final_frcls.loc[bank_str, 'Predicted Foreclosures'] = (np.mean(pred) * 100).round(1)
print(bank_str)
print('Original Foreclosures', final_frcls.loc[bank_str, 'Original Foreclosures'], '%')
print('Predicted Foreclosures', final_frcls.loc[bank_str, 'Predicted Foreclosures'], '%')
print('')
# Combine banks
combined_votes = pd.concat([combined_votes, pred], axis=0)
combined_orig = pd.concat([combined_orig, y_act[bank_str]], axis=0)
pred = X_pp['All Banks']['Majority']
final_frcls.loc['All Banks', 'Original Foreclosures'] = (np.mean(combined_orig) * 100).round(1)
final_frcls.loc['All Banks', 'Predicted Foreclosures'] = (np.mean(combined_votes) * 100).round(1)
print('All Banks')
print('Original Foreclosures', final_frcls.loc['All Banks', 'Original Foreclosures'], '%')
print('Predicted Foreclosures', final_frcls.loc['All Banks', 'Predicted Foreclosures'], '%')
return final_frcls, values
Table of foreclosure rates before and after the assumptions were changed (i.e., after the predicted probabilities were rerun)
def changed_assumptions_tbl(var, improved, weakened):
tbl = pd.concat([improved[var], weakened[var].iloc[:,1]], axis=1)
header = [['Foreclosures', var[:23], var[:23]],
['(2006-2008)', 'Improved', 'Weakened']]
tbl = pd.DataFrame(data=tbl.values, columns=header, index=tbl.index)
return tbl
Bar graph of foreclosure rates before and after the assumptions were changed (i.e., after the predicted probabilities were rerun)
Only the best and worst banks for a given feature are graphed
def predicted_gph(var, proba, proba_value, improved=True,
func = [max, min], l_xlim=None, r_xlim=None, round=0):
proba[var]['Difference'] = proba[var]['Original Foreclosures']-proba[var]['Predicted Foreclosures']
best_bnk = search_Banks('Difference', df = proba[var], func = func[0]).index[0]
worst_bnk = search_Banks('Difference', df = proba[var], func = func[1]).index[0]
best_orig = proba[var]['Original Foreclosures'].loc[best_bnk]
best_new = proba[var]['Predicted Foreclosures'].loc[best_bnk]
worst_orig = proba[var]['Original Foreclosures'].loc[worst_bnk]
worst_new = proba[var]['Predicted Foreclosures'].loc[worst_bnk]
all_orig = proba[var]['Original Foreclosures'].loc['All Banks']
all_new = proba[var]['Predicted Foreclosures'].loc['All Banks']
if improved:
labels = ['All Banks', str(best_bnk + '\nMost Improved'), str(worst_bnk + '\nLeast Improved')]
else:
labels = ['All Banks', str(best_bnk + '\nMost Weakened'), str(worst_bnk + '\nLeast Weakened')]
orig = [all_orig, best_orig, worst_orig]
imp = [all_new, best_new, worst_new]
y = np.arange(len(labels)) # the label locations
width = 0.35 # the width of the bars
fig, ax = plt.subplots(1, 1, figsize=[8,3])
if improved:
bar1 = ax.barh(y + width/2, imp, width, label='Improved Assumption', color = 'darkgreen')
else:
bar1 = ax.barh(y + width/2, imp, width, label='Weakened Assumption', color = 'maroon')
bar2 = ax.barh(y - width/2, orig, width, label='Original Score', color = '#457b9d') # 457b9d
ax.set_xlim(left=l_xlim, right=r_xlim)
ax.set_yticks(y)
ax.set_xlabel('Foreclosure Rate')
ax.invert_yaxis()
ax.set_yticklabels(labels)
if improved:
ax.text(1.075, 0, str('Improved assumption changed\n' + var + ' to ' + str(proba_value[var][0].round(round))),
fontsize='large', transform=ax.transAxes)
else:
ax.text(1.075, 0, str('Weakened assumption changed\n' + var + ' to ' + str(proba_value[var][0].round(round))),
fontsize='large', transform=ax.transAxes)
plt.title('Adjusting ' + var + '\nAssumptions')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
Function to plot target variable (and predictions)
Visualize the percentage and frequency of the target variable
def target_values(df_depvar, data=False, prediction=False):
# save target frequencies
target_frequency = df_depvar.value_counts()
# save target percentage
target_percentage = round((df_depvar.value_counts()/df_depvar.count())*100).astype(int)
# graphing target variable
jtplot.style(ticks=True, grid=False)
plt.figure(figsize=(14,4))
target_percentage.plot.barh(stacked=True, color='#ca2c92').invert_yaxis()
if data:
plt.suptitle('Bar Chart of Target Variable', fontsize=18)
elif prediction:
plt.suptitle('Bar Chart of Predictions', fontsize=18)
else:
plt.suptitle('Percent of Mortgage Defaults', fontsize=18)
plt.ylabel('Foreclosed')
plt.xlabel('Percentage')
plt.xlim([0,100])
# plt.yticks([0, 1], ['Did not Foreclose', 'Foreclosed'])
plt.show()
# display frequency of foreclosures
print('Frequency of Foreclosures\n', target_frequency, '\n', sep='')
# display percentage of foreclosures
print('Percentage of Foreclosures\n', target_percentage, '\n', sep='')
Visualize scores at various classification thresholds
def threshold(y_test, target_prob):
# Determine threshold
threshold = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
acc = []
prec = []
f1 = []
auc = []
best_auc = {'Threshold': 0.5, 'Best ROC AUC Score': 0.0}
best_acc = {'Threshold': 0.5, 'Best Accuracy Score': 0.0}
best_prec = {'Threshold': 0.5, 'Best Precision Score': 0.0}
best_f1 = {'Threshold': 0.5, 'Best F1 Score': 0.0}
for i in range(len(threshold)):
y_pred = target_prob.map(lambda x: 1 if x >= threshold[i] else 0)
# Accuracy
acc.append(accuracy_score(y_test, y_pred))
# Precision
prec.append(precision_score(y_test, y_pred))
# F1
f1.append(f1_score(y_test, y_pred))
# AUC
auc.append(roc_auc_score(y_test, y_pred))
# Save best accuracy
if (best_acc['Best Accuracy Score'] < acc[i]):
best_acc = {'Threshold': threshold[i], 'Best Accuracy Score': acc[i]}
# Save best precision
if (best_prec['Best Precision Score'] < prec[i]):
best_prec = {'Threshold': threshold[i], 'Best Precision Score': prec[i]}
# Save best f1
if (best_f1['Best F1 Score'] < f1[i]):
best_f1 = {'Threshold': threshold[i], 'Best F1 Score': f1[i]}
# Save best Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
if (best_auc['Best ROC AUC Score'] < auc[i]):
best_auc = {'Threshold': threshold[i], 'Best ROC AUC Score': auc[i]}
# Plot
df_plot = pd.DataFrame({'Threshold': threshold, 'Accuracy': acc,
'Precision': prec, 'ROC AUC': auc, 'F1': f1})
plt.figure(figsize=(12,4))
plt.plot(df_plot['Threshold'], df_plot.iloc[:,1:5])
plt.title('Scores at Various Thresholds')
plt.legend(['Accuracy', 'Precision', 'ROC AUC', 'F1'])
plt.show()
# Scores
y_pred = target_prob.map(lambda x: 1 if x >= best_f1['Threshold'] else 0)
print(classification_report(y_test, y_pred))
return( best_f1 )
Plot Target Classes
Visualizes whether there are any obvious classification boundaries
def plot_2d_space(X, y, label='Classes'):
colors = ['#1F77B4', '#FF7F0E']
markers = ['o', 's']
for l, c, m in zip(np.unique(y), colors, markers):
plt.scatter(
X[y==l, 0],
X[y==l, 1],
c=c, label=l, marker=m
)
plt.title(label)
plt.legend(loc='upper right')
plt.show()
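This is typically called on a two-dimensional projection; a sketch using the first two principal components of the scaled training data (assuming X_train and y_train are defined):
X_2d = PCA(n_components=2, random_state=2020).fit_transform(StandardScaler().fit_transform(X_train))
plot_2d_space(X_2d, np.array(y_train), label='Foreclosure classes (first two principal components)')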