# The purpose of this exercise is to build a model to qualify bank clients that have more chances of buying a loan. 
# Qualifying customers is really important on phone / tele marketing activities as our cost could increase if we work 
# with poorly qualified leads. It also could increase the risk of annoying customers that are contacted with 
# offers that are not of their interest. 
# To qualify the leads, I will use different methods such as logistic regression, decision tres, 
# random forest and other ensemble methods. 
# I will compare the performance of the models ussing accuracy, ROC and AUC
# and I will explain the results very briefly.

IMPORTING LIBRARIES¶

# Importing libraries

import pandas as pd
import numpy as np
from math import *
import sys

# Importing visualization libraries
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno
plt.style.use( 'ggplot' )
%matplotlib inline

#Import preprocessing libraries
from sklearn.preprocessing import MinMaxScaler , StandardScaler, Imputer, LabelEncoder, Normalizer , scale

# Ignore warnings 
import warnings
warnings.filterwarnings('ignore')

# I am going to display only 7 columns to avoid problems of overlapping when publishing in web
pd.set_option('display.max_columns', 7)

# Splitting data into training and testing
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.feature_selection import RFECV

# Machine Learning Models

# Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis


# Hyperparameter tuning, kfold and cross validation
from sklearn import model_selection
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, cross_val_score



# evaluation metrics :
# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix, classification_report
#Explanation
from sklearn import tree
from xgboost import XGBClassifier
from xgboost import plot_tree
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

DATA DICTIONARY¶

Input variables:

bank client data:¶

1 – age (numeric) 2 – job : type of job (categorical: ‘admin.’,’blue-collar’,’entrepreneur’,’housemaid’,’management’,’retired’,’self-employed’,’services’,’student’,’technician’,’unemployed’,’unknown’) 3 – marital : marital status (categorical: ‘divorced’,’married’,’single’,’unknown’; note: ‘divorced’ means divorced or widowed) 4 – education (categorical:’basic.4y’,’basic.6y’,’basic.9y’,’high.school’,’illiterate’, ‘professional.course’,’university.degree’,’unknown’) 5 – default: has credit in default? (categorical: ‘no’,’yes’,’unknown’) 6 – housing: has housing loan? (categorical: ‘no’,’yes’,’unknown’) 7 – loan: has personal loan? (categorical: ‘no’,’yes’,’unknown’)

8 – contact: contact communication type (categorical: ‘cellular’,’telephone’) 9 – month: last contact month of year (categorical: ‘jan’, ‘feb’, ‘mar’, …, ‘nov’, ‘dec’) 10 – day_of_week: last contact day of the week (categorical: ‘mon’,’tue’,’wed’,’thu’,’fri’) 11 – duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=’no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

other attributes:¶

12 – campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact) 13 – pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) 14 – previous: number of contacts performed before this campaign and for this client (numeric) 15 – poutcome: outcome of the previous marketing campaign (categorical: ‘failure’,’nonexistent’,’success’)

16 – emp.var.rate: employment variation rate – quarterly indicator (numeric) 17 – cons.price.idx: consumer price index – monthly indicator (numeric) 18 – cons.conf.idx: consumer confidence index – monthly indicator (numeric) 19 – euribor3m: euribor 3 month rate – daily indicator (numeric) 20 – nr.employed: number of employees – quarterly indicator (numeric)

Output variable (desired target): 21 – y – has the client subscribed a term deposit? (binary: ‘yes’,’no’)

READING AND CLEANING DATA¶

# Reading data
bank = pd.read_csv('C:/Users/Hector/Python_blog/bank-additional-full.csv',sep=';')

bank.head( )

# Columns in the dataset
bank.columns

Index([u'age', u'job', u'marital', u'education', u'default', u'housing',
       u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign',
       u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx',
       u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y'],
      dtype='object')

#Dataset dimensions
bank.shape

(41188, 21)

# Data types
# We will need to transform categorical variables into numeric after the EDA
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usage: 6.6+ MB

# Columns stats
bank.describe()
# I will need to eliminate duration from the dataset as per the dictionary recommendation

# Checking nulls
bank.isnull().sum()
# The dataset is quite clean and we don´t have nulls

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

EXPLORATORY DATA ANALYSIS¶

# Preparing the label
# Converting the categorical value into 1 and 0
label=bank[['y']]
label_dummies=pd.get_dummies(label)
y=label_dummies[['y_yes']]
y.head()
bank['y_label']=y['y_yes']
bank.head()

#Duration
# to create a predictive model i will eliminate this variable, as this variable is know ex post, 
# once the campaign is done, if the duration of the telephone call is long enough, chances are
# that the customer has bought the product

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['duration'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['duration'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Duration', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of Duration by Success', size = 28);

#Past Days
#We have two types of clients, those that have been contacted this year, and those for which the last
#Contact was 3 year ago
#

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['pdays'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['pdays'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Past days', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of Pdays by Success', size = 28);

#Previous Campaigns
#Generally speaking A brand new customer is more likely to buy the product than other already targeted
#the success ratio seems to below 50% after the 3rd campaign

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['campaign'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['campaign'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Campaign', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of Campaign by Success', size = 28);

#emp.var.rate
#The product sell seems to be more successfull when the external conditions are though, 
# in this case, when the employment rate decrease, customers tend to subscribe the loan.

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['emp.var.rate'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['emp.var.rate'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Campaign', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of emp.var.rate by Success', size = 28);

# cons.price.idx
# All the historical series is deflactionary
# Customers doesnt seem to take price indexes into consideration, at least directly to subscribe  loans

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['cons.price.idx'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['cons.price.idx'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Campaign', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of cons.price.idx by Success', size = 28);

# euribor3m
# Customers seems to reject more the product when interest rates are high

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['euribor3m'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['euribor3m'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Campaign', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of euribor3m by Success', size = 28);

# nr.employed
# This variable seems to be not related.
# The more employees we have, the more we dedicate to telemarketing, the more the wins
# To create a predictive model we should eliminate this variable, I am going to leave the variable
# for now

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['nr.employed'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['nr.employed'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Campaign', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of nr.employed by Success', size = 28);

# age
# This variable seems to be not related.
# Older than 60 and lower than 30 seems to buy the product more easily

#It is more difficult sell the product between 40 and 45

plt.figure(figsize=(16,7))
# Density plot of Energy Star scores
sns.kdeplot(bank[bank['y_label']==1]['age'],label = 'yes',c='b');
sns.kdeplot(bank[bank['y_label']==0]['age'],label = 'no',c='r');
    
# label the plot
plt.xlabel('Campaign', size = 20); plt.ylabel('Density', size = 20); 
plt.title('Density Plot of age by Success', size = 28);

#exploratory data analysis  with categorical variables
# There are higher success rate in students and retired, aalthough the bulk fo the wins come from admin, blue collar
# and technician
#job
table=pd.DataFrame(bank.groupby(['job','y']).count()['y_label']).reset_index()
#type(table_job.reset_index)
table.pivot(index='job',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),title='Job and Success');
table=table.pivot(index='job',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='job',y='rate',figsize=(14,7),title='Job and Success rate' );

EDA WITH CATEGORICAL VARIABLES¶

#exploratory data analysis  with categorical variables
# higher success rates in single
#marital
#it seems that the success rate is a little bit higher in divorced and single than in married
table=pd.DataFrame(bank.groupby(['marital','y']).count()['y_label']).reset_index()

table.pivot(index='marital',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),title='Marital and Success');
table=table.pivot(index='marital',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='marital',y='rate',figsize=(14,7),title='Marital and Success rate' );

#marital
# it seems that the success rate is a little bit higher in those with high school or university education
# higheer success rate in univesity degree and professional course
table=pd.DataFrame(bank.groupby(['education','y']).count()['y_label']).reset_index()

table.pivot(index='education',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='Education and Success');
table=table.pivot(index='education',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='education',y='rate',figsize=(14,7),title='Education and Success rate' );

#default
# it seems that the product is not offered to those that have defaulted
# higher success rate and volume in those that have not defaulted.
# Clearly the bank does not grant credit cards to those with bag credit records
table=pd.DataFrame(bank.groupby(['default','y']).count()['y_label']).reset_index()

table.pivot(index='default',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='Default and Success');
table=table.pivot(index='default',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='default',y='rate',figsize=(14,7),title='Default and Success rate' );

# housing
# it seems that those that have house are more prone to buy the loan
table=pd.DataFrame(bank.groupby(['housing','y']).count()['y_label']).reset_index()

table.pivot(index='housing',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='housing and Success');

# loan
# those that do not have loan have more chances of buying
table=pd.DataFrame(bank.groupby(['loan','y']).count()['y_label']).reset_index()

table.pivot(index='loan',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='loan and Success');

table=table.pivot(index='loan',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='loan',y='rate',figsize=(14,7),title='Loan and Success rate' );

# contact
# those with cellular have higher rate of success and volume
table=pd.DataFrame(bank.groupby(['contact','y']).count()['y_label']).reset_index()

table.pivot(index='contact',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='Contact and Success');
table=table.pivot(index='contact',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='contact',y='rate',figsize=(14,7),title='Contact and Success rate' );

# month of success
# the contacts are not  evenly spread across months
# the sales have seasonality. The top of the sales is on December, March and October.
# they are related with Christmast Hollidays, White Hollidays and the starting of the academic term

table=pd.DataFrame(bank.groupby(['month','y']).count()['y_label']).reset_index()

table.pivot(index='month',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='Month and Success');
table=table.pivot(index='month',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='month',y='rate',figsize=(14,7),title='Month and Success rate' );

# day of  week
# there are more chances of closing the sale from midweek to friday than in the earlier days of the week

table=pd.DataFrame(bank.groupby(['day_of_week','y']).count()['y_label']).reset_index()

table.pivot(index='day_of_week',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='day_of_week and Success');

table=table.pivot(index='day_of_week',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='day_of_week',y='rate',figsize=(14,7),title='Day of week and Success rate' );

# previous outcome
# it seems that new  customers and  those that already bought are more prone to buy that those that did not buy
table=pd.DataFrame(bank.groupby(['poutcome','y']).count()['y_label']).reset_index()

table.pivot(index='poutcome',columns='y',values='y_label').plot.bar(stacked=True,figsize=(14,7),
                                                                     title='poutcome and Success');
table=table.pivot(index='poutcome',columns='y',values='y_label')
table['rate']=table['yes']*100/(table['yes']+table['no'] )
table.reset_index(inplace=True)
table.plot.line(x='poutcome',y='rate',figsize=(14,7),title='Poutcome and Success rate' );

DATA PREPROCESING: ENCODING, SCALING¶

#lets encode the categorical variables to  use them in the models
# Select the numeric columns
numeric_subset = bank.select_dtypes('number')

categorical_subset = bank[['job', 'marital','education','default','housing','loan',
                           'contact','month','day_of_week','poutcome']]
# One hot encode
categorical_subset = pd.get_dummies(categorical_subset)

# Join the two dataframes using concat
# Make sure to use axis = 1 to perform a column bind
features = pd.concat([numeric_subset, categorical_subset], axis = 1)

features.shape

(41188, 64)

features.head()

PREPARING THE TRAIN AND TEST DATASETS¶

# Separating the features and the labels
# droping labels from features and duration
y=features[['y_label']]
features.drop('y_label', inplace=True, axis=1)
features.drop('duration', inplace=True, axis=1)

#splitting the data into test and train
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size = 0.2, random_state = 101)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32950, 62)
(8238, 62)
(32950, 1)
(8238, 1)

# creating a base line
baseline_guess=round(y['y_label'].sum()*100/len(y['y_label']),2)
print('The baseline guess is a score of %0.2f' % baseline_guess)
q=100-baseline_guess
print("Baseline Performance on the test set: %0.2f" % q)

The baseline guess is a score of 11.00
Baseline Performance on the test set: 89.00

# We don´t have missing values to impute
# Making sure all values are finite, if not, imputing or dropping them
# Make sure all values are finite
print(np.where(~np.isfinite(X_train)))
print(np.where(~np.isfinite(X_test)))
# We don´t have infinite values

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))

SCALING THE FEATURES¶

# I am going to use different classifiers, and to compare them rightly, I need to normalize the features
# Create the scaler object with a range of 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
scaler.fit(X_train)

# Transform both the training and testing data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

TRAINING AND TESTING DIFFERENT METHODS¶

LOGISTIC REGRESSION¶

# Lets test and train different machine learning Classifiers

# LOGISTIC REGRESION
logmodel = LogisticRegression() 
logmodel.fit(X_train,y_train)
logpred = logmodel.predict(X_test)

#Confussion  matrix
CM=confusion_matrix(y_test, logpred)
print(CM)

[[7188   91]
 [ 745  214]]

#round(metrics.precision_score(y_test, logpred),2)
#round(metrics.recall_score(y_test, logpred),2)
#round(metrics.f1_score(y_test, logpred),2)

#accuracy_score(y_test, logpred)
#classification_report(y_test, logpred)

confusion_df = pd.DataFrame(confusion_matrix(y_test, logpred),
             columns=["Predicted Class " + str(class_name) for class_name in [0,1]],
             index = ["Class " + str(class_name) for class_name in [0,1]])
confusion_df

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.2231
FNR_MISS_RATE = 0.7769
FPR_FALSE_ALARM = 0.0125
TNR_SPECIFICITY_SELECTIVITY = 0.9875
PRECISION = 0.7016
ACCURACY = 0.8985
F1_SCORE = 0.0847

#Accuracy
round(accuracy_score(y_test, logpred),4)*100

89.85

# Accuracy With cross_validation
cross_val_score(logmodel, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy')

array([0.90531108, 0.89377845, 0.89681335, 0.90925645, 0.89924127,
       0.907739  , 0.90106222, 0.8952959 , 0.89984825, 0.8952959 ])

# Mean Accuracy
Log_reg_CV = (cross_val_score(logmodel, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
Log_reg_CV

0.9003641881638845

KNeighbors¶

#Neighbors
neighbors = np.arange(0,25)

#Create empty list that will hold cv scores
cv_scores = []

#Perform 10-fold cross validation on training set for odd values of k:
for k in neighbors:
    k_value = k+1
    knn = KNeighborsClassifier(n_neighbors = k_value, weights='uniform', p=2, metric='euclidean')
    kfold = model_selection.KFold(n_splits=10, random_state=101)
    scores = model_selection.cross_val_score(knn, X_train, y_train, cv=kfold, scoring='accuracy')
    cv_scores.append(scores.mean()*100)
    print("k=%d %0.2f (+/- %0.2f)" % (k_value, scores.mean()*100, scores.std()*100))

optimal_k = neighbors[cv_scores.index(max(cv_scores))]
print ("The optimal number of neighbors is %d with %0.1f%%" % (optimal_k, cv_scores[optimal_k]))

plt.plot(neighbors, cv_scores)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Train Accuracy')
plt.show()

k=1 84.92 (+/- 0.41)
k=2 88.89 (+/- 0.48)
k=3 88.29 (+/- 0.36)
k=4 89.18 (+/- 0.30)
k=5 89.00 (+/- 0.33)
k=6 89.36 (+/- 0.36)
k=7 89.33 (+/- 0.45)
k=8 89.54 (+/- 0.40)
k=9 89.56 (+/- 0.48)
k=10 89.63 (+/- 0.46)
k=11 89.54 (+/- 0.48)
k=12 89.63 (+/- 0.50)
k=13 89.64 (+/- 0.43)
k=14 89.65 (+/- 0.40)
k=15 89.67 (+/- 0.43)
k=16 89.71 (+/- 0.42)
k=17 89.68 (+/- 0.41)
k=18 89.69 (+/- 0.44)
k=19 89.67 (+/- 0.47)
k=20 89.71 (+/- 0.48)
k=21 89.73 (+/- 0.49)
k=22 89.70 (+/- 0.48)
k=23 89.79 (+/- 0.48)
k=24 89.81 (+/- 0.47)
k=25 89.85 (+/- 0.50)
The optimal number of neighbors is 24 with 89.8%

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=24)
knn.fit(X_train, y_train)
knnpred = knn.predict(X_test)
CM=confusion_matrix(y_test, knnpred)
confusion_matrix(y_test, knnpred)
(round(accuracy_score(y_test, knnpred),2)*100)
K_neighbors_CV = (cross_val_score(knn, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())

#Confussion  matrix
CM

array([[7219,   60],
       [ 803,  156]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.1627
FNR_MISS_RATE = 0.8373
FPR_FALSE_ALARM = 0.0082
TNR_SPECIFICITY_SELECTIVITY = 0.9918
PRECISION = 0.7222
ACCURACY = 0.8952
F1_SCORE = 0.0664

# Mean Accuracy
K_neighbors_CV

0.8982094081942338

SUPPORT VECTORS MACHINE¶

from sklearn.svm import SVC
svc= SVC(kernel = 'sigmoid')
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
CM=confusion_matrix(y_test, svcpred)
(round(accuracy_score(y_test, svcpred),2)*100)
Support_Vectors_CV = (cross_val_score(svc, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
Support_Vectors_CV

0.897389984825493

#Confussion  matrix
CM

array([[7208,   71],
       [ 774,  185]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.1929
FNR_MISS_RATE = 0.8071
FPR_FALSE_ALARM = 0.0098
TNR_SPECIFICITY_SELECTIVITY = 0.9902
PRECISION = 0.7227
ACCURACY = 0.8974
F1_SCORE = 0.0761

# Mean Accuracy
Support_Vectors_CV

0.897389984825493

DECISSION TREE¶

from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='gini') #criterion = entopy, gini
dtree.fit(X_train, y_train)
dtreepred = dtree.predict(X_test)

CM=confusion_matrix(y_test, dtreepred)
(round(accuracy_score(y_test, dtreepred),2)*100)
Decission_Tree_CV = (cross_val_score(dtree, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
Decission_Tree_CV

0.8436722306525037

#Confussion  matrix
CM

array([[6615,  664],
       [ 633,  326]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.3399
FNR_MISS_RATE = 0.6601
FPR_FALSE_ALARM = 0.0912
TNR_SPECIFICITY_SELECTIVITY = 0.9088
PRECISION = 0.3293
ACCURACY = 0.8426
F1_SCORE = 0.0836

# Mean Accuracy
Decission_Tree_CV

0.8436722306525037

RANDOM FOREST¶

#Random Forest

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 200)#criterion = entropy,gini
rfc.fit(X_train, y_train)
rfcpred = rfc.predict(X_test)

CM=confusion_matrix(y_test, rfcpred )
(round(accuracy_score(y_test, rfcpred),2)*100)
Random_Forest_CV = (cross_val_score(rfc, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
Random_Forest_CV

0.8932928679817908

#Confussion  matrix
CM

array([[7063,  216],
       [ 672,  287]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.2993
FNR_MISS_RATE = 0.7007
FPR_FALSE_ALARM = 0.0297
TNR_SPECIFICITY_SELECTIVITY = 0.9703
PRECISION = 0.5706
ACCURACY = 0.8922
F1_SCORE = 0.0982

# Mean Accuracy
Random_Forest_CV

0.8932928679817908

NAIVE BAYES¶

from sklearn.naive_bayes import GaussianNB
gaussiannb= GaussianNB()
gaussiannb.fit(X_train, y_train)
gaussiannbpred = gaussiannb.predict(X_test)
probs = gaussiannb.predict(X_test)

CM=confusion_matrix(y_test, gaussiannbpred )
(round(accuracy_score(y_test, gaussiannbpred),2)*100)
GAUSIAN_NAIVE_BAYES_CV = (cross_val_score(gaussiannb, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
GAUSIAN_NAIVE_BAYES_CV

0.7959332321699545

#Confussion  matrix
CM

array([[5999, 1280],
       [ 366,  593]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.6184
FNR_MISS_RATE = 0.3816
FPR_FALSE_ALARM = 0.1758
TNR_SPECIFICITY_SELECTIVITY = 0.8242
PRECISION = 0.3166
ACCURACY = 0.8002
F1_SCORE = 0.1047

# Mean Accuracy
GAUSIAN_NAIVE_BAYES_CV

0.7959332321699545

XGB Classifier¶

from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgbprd = xgb.predict(X_test)

CM=confusion_matrix(y_test, xgbprd )
round(accuracy_score(y_test, xgbprd),2)*100
XGB_CV = (cross_val_score(estimator = xgb, X = X_train, y = y_train, cv = 10).mean())
XGB_CV

0.9009712955576571

#Confussion  matrix
CM

array([[7200,   79],
       [ 750,  209]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.2179
FNR_MISS_RATE = 0.7821
FPR_FALSE_ALARM = 0.0109
TNR_SPECIFICITY_SELECTIVITY = 0.9891
PRECISION = 0.7257
ACCURACY = 0.8994
F1_SCORE = 0.0838

# Mean Accuracy
XGB_CV

0.9009712955576571

Gradient Boosting Classifier¶

from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
gbk.fit(X_train, y_train)
gbkpred = gbk.predict(X_test)
CM=confusion_matrix(y_test, gbkpred )
round(accuracy_score(y_test, gbkpred),2)*100
Gradient_Boosting_KCV = (cross_val_score(gbk, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
Gradient_Boosting_KCV

0.9010622154779968

#Confussion  matrix
CM

array([[7184,   95],
       [ 730,  229]], dtype=int64)

#Calculating different metrics
TrueP=CM[1,1]
TrueN=CM[0,0]
FalseP=CM[0,1]
FalseN=CM[1,0]

TPR_RECALL_SENSITIVITY=TrueP*1.0/(TrueP+FalseN)
FNR_MISS_RATE=FalseN*1.0/(TrueP+FalseN)
FPR_FALSE_ALARM=FalseP*1.0/(FalseP+TrueN)
TNR_SPECIFICITY_SELECTIVITY=TrueN*1.0/(FalseP+TrueN)
PRECISION=TrueP*1.0/(TrueP+FalseP)
ACCURACY=(TrueP+TrueN)*1.0/(TrueP+TrueN+FalseP+FalseN)
F1_SCORE=1*1.0/((1/TPR_RECALL_SENSITIVITY)+(1/PRECISION))/2

print('TPR_RECALL_SENSITIVITY = %0.4f' % TPR_RECALL_SENSITIVITY)
print('FNR_MISS_RATE = %0.4f' % FNR_MISS_RATE)
print('FPR_FALSE_ALARM = %0.4f' % FPR_FALSE_ALARM)
print('TNR_SPECIFICITY_SELECTIVITY = %0.4f' % TNR_SPECIFICITY_SELECTIVITY)
print('PRECISION = %0.4f' % PRECISION)
print('ACCURACY = %0.4f' % ACCURACY)
print('F1_SCORE = %0.4f' % F1_SCORE)

TPR_RECALL_SENSITIVITY = 0.2388
FNR_MISS_RATE = 0.7612
FPR_FALSE_ALARM = 0.0131
TNR_SPECIFICITY_SELECTIVITY = 0.9869
PRECISION = 0.7068
ACCURACY = 0.8999
F1_SCORE = 0.0892

# Mean Accuracy
Gradient_Boosting_KCV

0.9010622154779968

MODELS COMPARISON¶

# MODELS COMPARISON
models = pd.DataFrame({
                'Models': ['Random Forest Classifier', 'Decision Tree Classifier', 'Support Vector Machine',
                           'K-Near Neighbors', 'Logistic Model', 'Gausian NB', 'XGBoost', 'Gradient Boosting'],
                'Score':  [Random_Forest_CV, Decission_Tree_CV, Support_Vectors_CV, K_neighbors_CV,
                           Log_reg_CV, GAUSIAN_NAIVE_BAYES_CV, XGB_CV, Gradient_Boosting_KCV]})

models.sort_values(by='Score', ascending=False)

models.set_index('Models',inplace=True)
models.plot.barh(title='MACHINE LEARNING MODEL CLASSIFIERS ACCURACY SCORE', figsize=(16,6), legend= False);

RECEIVER OPERATING CHARACTERISTIC (ROC) AND AREA UNDER THE CURVE (AUC)¶

# AREA UNDER THE CURVE
# XGBOOST ROC/ AUC , BEST MODEL
from sklearn import metrics
fig, (ax, ax1) = plt.subplots(nrows = 1, ncols = 2, figsize = (15,5))
probs = xgb.predict_proba(X_test)
preds = probs[:,1]
fprxgb, tprxgb, thresholdxgb = metrics.roc_curve(y_test, preds)
roc_aucxgb = metrics.auc(fprxgb, tprxgb)

ax.plot(fprxgb, tprxgb, 'b', label = 'AUC = %0.2f' % roc_aucxgb)
ax.plot([0, 1], [0, 1],'r--')
ax.set_title('Receiver Operating Characteristic XGBOOST ',fontsize=10)
ax.set_ylabel('True Positive Rate',fontsize=20)
ax.set_xlabel('False Positive Rate',fontsize=15)
ax.legend(loc = 'lower right', prop={'size': 16})

#Gradient
probs = gbk.predict_proba(X_test)
preds = probs[:,1]
fprgbk, tprgbk, thresholdgbk = metrics.roc_curve(y_test, preds)
roc_aucgbk = metrics.auc(fprgbk, tprgbk)

ax1.plot(fprgbk, tprgbk, 'b', label = 'AUC = %0.2f' % roc_aucgbk)
ax1.plot([0, 1], [0, 1],'r--')
ax1.set_title('Receiver Operating Characteristic GRADIENT BOOST ',fontsize=10)
ax1.set_ylabel('True Positive Rate',fontsize=20)
ax1.set_xlabel('False Positive Rate',fontsize=15)
ax1.legend(loc = 'lower right', prop={'size': 16})

plt.subplots_adjust(wspace=1)

ROC AND AUC : REST OF THE MODELS¶

#fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows = 2, ncols = 3, figsize = (15, 4))
fig, ax_arr = plt.subplots(nrows = 2, ncols = 3, figsize = (20,15))

#LOGMODEL
probs = logmodel.predict_proba(X_test)
preds = probs[:,1]
fprlog, tprlog, thresholdlog = metrics.roc_curve(y_test, preds)
roc_auclog = metrics.auc(fprlog, tprlog)

ax_arr[0,0].plot(fprlog, tprlog, 'b', label = 'AUC = %0.2f' % roc_auclog)
ax_arr[0,0].plot([0, 1], [0, 1],'r--')
ax_arr[0,0].set_title('Receiver Operating Characteristic Logistic ',fontsize=20)
ax_arr[0,0].set_ylabel('True Positive Rate',fontsize=20)
ax_arr[0,0].set_xlabel('False Positive Rate',fontsize=15)
ax_arr[0,0].legend(loc = 'lower right', prop={'size': 16})

#RANDOM FOREST --------------------
probs = rfc.predict_proba(X_test)
preds = probs[:,1]
fprrfc, tprrfc, thresholdrfc = metrics.roc_curve(y_test, preds)
roc_aucrfc = metrics.auc(fprrfc, tprrfc)

ax_arr[0,1].plot(fprrfc, tprrfc, 'b', label = 'AUC = %0.2f' % roc_aucrfc)
ax_arr[0,1].plot([0, 1], [0, 1],'r--')
ax_arr[0,1].set_title('Receiver Operating Characteristic Random Forest ',fontsize=20)
ax_arr[0,1].set_ylabel('True Positive Rate',fontsize=20)
ax_arr[0,1].set_xlabel('False Positive Rate',fontsize=15)
ax_arr[0,1].legend(loc = 'lower right', prop={'size': 16})

#KNN----------------------
probs = knn.predict_proba(X_test)
preds = probs[:,1]
fprknn, tprknn, thresholdknn = metrics.roc_curve(y_test, preds)
roc_aucknn = metrics.auc(fprknn, tprknn)

ax_arr[0,2].plot(fprknn, tprknn, 'b', label = 'AUC = %0.2f' % roc_aucknn)
ax_arr[0,2].plot([0, 1], [0, 1],'r--')
ax_arr[0,2].set_title('Receiver Operating Characteristic KNN ',fontsize=20)
ax_arr[0,2].set_ylabel('True Positive Rate',fontsize=20)
ax_arr[0,2].set_xlabel('False Positive Rate',fontsize=15)
ax_arr[0,2].legend(loc = 'lower right', prop={'size': 16})

#DECISION TREE ---------------------
probs = dtree.predict_proba(X_test)
preds = probs[:,1]
fprdtree, tprdtree, thresholddtree = metrics.roc_curve(y_test, preds)
roc_aucdtree = metrics.auc(fprdtree, tprdtree)

ax_arr[1,0].plot(fprdtree, tprdtree, 'b', label = 'AUC = %0.2f' % roc_aucdtree)
ax_arr[1,0].plot([0, 1], [0, 1],'r--')
ax_arr[1,0].set_title('Receiver Operating Characteristic Decision Tree ',fontsize=20)
ax_arr[1,0].set_ylabel('True Positive Rate',fontsize=20)
ax_arr[1,0].set_xlabel('False Positive Rate',fontsize=15)
ax_arr[1,0].legend(loc = 'lower right', prop={'size': 16})

#GAUSSIAN ---------------------
probs = gaussiannb.predict_proba(X_test)
preds = probs[:,1]
fprgau, tprgau, thresholdgau = metrics.roc_curve(y_test, preds)
roc_aucgau = metrics.auc(fprgau, tprgau)

ax_arr[1,1].plot(fprgau, tprgau, 'b', label = 'AUC = %0.2f' % roc_aucgau)
ax_arr[1,1].plot([0, 1], [0, 1],'r--')
ax_arr[1,1].set_title('Receiver Operating Characteristic Gaussian ',fontsize=20)
ax_arr[1,1].set_ylabel('True Positive Rate',fontsize=20)
ax_arr[1,1].set_xlabel('False Positive Rate',fontsize=15)
ax_arr[1,1].legend(loc = 'lower right', prop={'size': 16})

#ALL PLOTS ----------------------------------
ax_arr[1,2].plot(fprgau, tprgau, 'b', label = 'Gaussian', color='black')
ax_arr[1,2].plot(fprdtree, tprdtree, 'b', label = 'Decision Tree', color='blue')
ax_arr[1,2].plot(fprknn, tprknn, 'b', label = 'Knn', color='brown')
ax_arr[1,2].plot(fprrfc, tprrfc, 'b', label = 'Random Forest', color='green')
ax_arr[1,2].plot(fprlog, tprlog, 'b', label = 'Logistic', color='grey')
ax_arr[1,2].set_title('Receiver Operating Comparison ',fontsize=20)
ax_arr[1,2].set_ylabel('True Positive Rate',fontsize=20)
ax_arr[1,2].set_xlabel('False Positive Rate',fontsize=15)
ax_arr[1,2].legend(loc = 'lower right', prop={'size': 16})

plt.subplots_adjust(wspace=0.2)
plt.tight_layout()

INTERPRETATION OF RESULTS¶

The best model was Gradient Boosting classifier. I have managed to create a model that can predict the success of a marketing lead when it comes to sell a loan with a 90 % of accuracy. Being able to qualify leads is key in a cost intense activity such as telephone marking, that even can annoy customers that are not interested in the value proposition offered. The model as it is, seems like a black box, but i am going to interpret the results through features importance.

# Extract the feature importances into a dataframe
feature_results = pd.DataFrame({'feature': list(features.columns), 
                                'importance': gbk.feature_importances_})

# Show the top 10 most important
feature_results = feature_results.sort_values('importance', ascending = False).reset_index(drop=True)

feature_results.head(10)

# The euribor3m, nr.employed, age, cons.conf.idx and pdays are the 
# best predictors to know if we are going to sell the loan
# Thee truth is that we should repeat the analysis eliminating nr.employed, because this variable should not take
# into consideration for the leads qualification.
# The more the employees we have, the more work on telemarketig and the more wins we have.
# Nevertheless i will continue the analysis as it is.

#Lets graph the features importance
#figsize(12, 10)
plt.style.use('fivethirtyeight')

# Plot the 10 most important features in a horizontal bar chart
feature_results.loc[:9, :].plot(x = 'feature', y = 'importance', 
                                 edgecolor = 'k',
                                 kind='barh', color = 'blue',figsize=(12, 10) );
plt.xlabel('Relative Importance', size = 20); plt.ylabel('')
plt.title('Feature Importances from GBK', size = 30);

#although this model uses many decision tres to take decisions, we can explore a single tree to know how some part
# of the model works

# Extract a single tree
single_tree = gbk.estimators_[1][0]
tree.export_graphviz(single_tree, out_file = 'tree.dot',
                     rounded = True, 
                     feature_names = list(features.columns),
                     filled = True)

single_tree

DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort='auto',
           random_state=<mtrand.RandomState object at 0x0000000006C56630>,
           splitter='best')

#Using the graphical tool
# https://dreampuf.github.io
# we obtain the first decission tree of our ensemble method.
# as I have said before, we have to repeat te calculations excluding the number of employees.
# it is clear that this variable should not be taken into acccount if we want to create a model to qualify the customers
# The more employees we have, the more are dedicating to contacting, and the more wins we have.
# We should repeat the process but excluding this variable.

# In the chart we can see one of the 100 decission trees that we have used in our method.
# When interpreting the chart, we must take into account that we have scaled our data using mix max scaler.

from IPython.display import Image
Image("decision_tree.png")

	age	duration	campaign	…	cons.conf.idx	euribor3m	nr.employed
count	41188.00000	41188.000000	41188.000000	…	41188.000000	41188.000000	41188.000000
mean	40.02406	258.285010	2.567593	…	-40.502600	3.621291	5167.035911
std	10.42125	259.279249	2.770014	…	4.628198	1.734447	72.251528
min	17.00000	0.000000	1.000000	…	-50.800000	0.634000	4963.600000
25%	32.00000	102.000000	1.000000	…	-42.700000	1.344000	5099.100000
50%	38.00000	180.000000	2.000000	…	-41.800000	4.857000	5191.000000
75%	47.00000	319.000000	3.000000	…	-36.400000	4.961000	5228.100000
max	98.00000	4918.000000	56.000000	…	-26.900000	5.045000	5228.100000

	feature	importance
0	euribor3m	0.211672
1	nr.employed	0.139916
2	age	0.114080
3	cons.conf.idx	0.076407
4	pdays	0.064454
5	campaign	0.043023
6	month_apr	0.030890
7	previous	0.027782
8	cons.price.idx	0.024283
9	poutcome_failure	0.023997

	age	job	marital	…	euribor3m	nr.employed	y
0	56	housemaid	married	…	4.857	5191.0	no
1	57	services	married	…	4.857	5191.0	no
2	37	services	married	…	4.857	5191.0	no
3	40	admin.	married	…	4.857	5191.0	no
4	56	services	married	…	4.857	5191.0	no

	age	duration	campaign	…	poutcome_nonexistent
0	56	261	1	…	1
1	57	149	1	…	1
2	37	226	1	…	1
3	40	151	1	…	1
4	56	307	1	…	1

	Models	Score
7	Gradient Boosting	0.901062
6	XGBoost	0.900971
4	Logistic Model	0.900364
3	K-Near Neighbors	0.898209
2	Support Vector Machine	0.897390
0	Random Forest Classifier	0.893293
1	Decision Tree Classifier	0.843672
5	Gausian NB	0.795933

	age	duration	campaign	…	poutcome_nonexistent
0	56	261	1	…	1
1	57	149	1	…	1
2	37	226	1	…	1
3	40	151	1	…	1
4	56	307	1	…	1