### DataFrame
import pandas as pd

### For working with arrays.
import numpy as np

### Functions creating iterators for efficient looping.
import itertools

### Creating visualizations.
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

### Label Encoding
from sklearn.preprocessing import LabelEncoder

### Hyperparameter Tuning
from sklearn.linear_model import LogisticRegression # Dependent variable is binary
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

### ROC curves
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

### Precision-Recall curve and F1 score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### Confusion Matrix
from sklearn.metrics import confusion_matrix


# Load the loan dataset from a CSV file (path relative to the working directory)
# and preview its first rows.
df = pd.read_csv("Loan Prediction Dataset.csv")
df.head()


# Summary statistics of the numeric columns.
df.describe()


# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Count the missing (null) values in each column.
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


# Numerical columns: replace each missing entry with the column mean.
for numeric_col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())


# Categorical columns: replace each missing entry with the most frequent value (mode).
for categorical_col in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    df[categorical_col] = df[categorical_col].fillna(df[categorical_col].mode()[0])


# Re-count the nulls to confirm the imputation left no missing values.
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


# categorical attributes visualization
# The Series is passed as the keyword `x`: newer seaborn releases interpret the first
# positional argument as `data`, so the positional form is not portable.
sns.countplot(x=df['Gender'])

<AxesSubplot:xlabel='Gender', ylabel='count'>


# Count of applicants per marital status; the Series is passed explicitly as `x`
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['Married'])

<AxesSubplot:xlabel='Married', ylabel='count'>


# Count of applicants per number of dependents; the Series is passed explicitly as `x`
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['Dependents'])

<AxesSubplot:xlabel='Dependents', ylabel='count'>


# Count of applicants per education level; the Series is passed explicitly as `x`
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['Education'])

<AxesSubplot:xlabel='Education', ylabel='count'>


# Count of applicants per self-employment flag; the Series is passed explicitly as `x`
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['Self_Employed'])

<AxesSubplot:xlabel='Self_Employed', ylabel='count'>


# Count of applicants per property area; the Series is passed explicitly as `x`
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['Property_Area'])

<AxesSubplot:xlabel='Property_Area', ylabel='count'>


# Count of applications per loan status (the target); the Series is passed explicitly
# as `x` because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['Loan_Status'])

<AxesSubplot:xlabel='Loan_Status', ylabel='count'>


# numerical attributes visualization
# Distribution of the raw applicant income.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# are the suggested replacements) — confirm against the installed version.
sns.distplot(df["ApplicantIncome"])

<AxesSubplot:xlabel='ApplicantIncome', ylabel='Density'>


# Distribution of the raw co-applicant income.
sns.distplot(df["CoapplicantIncome"])

<AxesSubplot:xlabel='CoapplicantIncome', ylabel='Density'>


# Distribution of the raw loan amount.
sns.distplot(df["LoanAmount"])

<AxesSubplot:xlabel='LoanAmount', ylabel='Density'>


# Distribution of the raw loan term.
sns.distplot(df['Loan_Amount_Term'])

<AxesSubplot:xlabel='Loan_Amount_Term', ylabel='Density'>


# Distribution of the credit-history column.
sns.distplot(df['Credit_History'])
# The values already lie in the range 0 to 1,
# so no log transformation is applied to this column.

<AxesSubplot:xlabel='Credit_History', ylabel='Density'>


# Total income: applicant income plus co-applicant income, row by row.
df['Total_Income'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])
df.head()


# Apply a natural-log transformation to the applicant income and plot the result.
df['ApplicantIncomeLog'] = np.log(df['ApplicantIncome'])
sns.distplot(df["ApplicantIncomeLog"])
# ApplicantIncomeLog ranges roughly between 5 and 12.

<AxesSubplot:xlabel='ApplicantIncomeLog', ylabel='Density'>


# Natural log of the co-applicant income. Rows with a co-applicant income of 0 become
# -inf here; the column is stored as computed.
df['CoapplicantIncomeLog'] = np.log(df['CoapplicantIncome'])
# Plot the co-applicant column itself, restricted to its finite values, because the
# -inf entries cannot be placed on a histogram/density plot.
sns.distplot(df.loc[np.isfinite(df['CoapplicantIncomeLog']), 'CoapplicantIncomeLog'])

<AxesSubplot:xlabel='ApplicantIncomeLog', ylabel='Density'>


# Natural log of the loan amount, followed by its distribution plot.
df['LoanAmountLog'] = np.log(df['LoanAmount'])
sns.distplot(df["LoanAmountLog"])

<AxesSubplot:xlabel='LoanAmountLog', ylabel='Density'>


# Natural log of the loan term, followed by its distribution plot.
df['Loan_Amount_Term_Log'] = np.log(df['Loan_Amount_Term'])
sns.distplot(df["Loan_Amount_Term_Log"])
# Most of the values sit close to 6.
# The scale shrinks from roughly 0-500 to roughly 0-6, which is still better than
# before the log transformation.

<AxesSubplot:xlabel='Loan_Amount_Term_Log', ylabel='Density'>


# Natural log of the total income, followed by its distribution plot.
df['Total_Income_Log'] = np.log(df['Total_Income'])
sns.distplot(df["Total_Income_Log"])

<AxesSubplot:xlabel='Total_Income_Log', ylabel='Density'>


# Preview the frame with Total_Income and the log-transformed columns added.
df.head()


# Correlation matrix of the numeric columns only. The frame still holds string columns
# at this point; selecting the numeric dtypes explicitly gives the same matrix that older
# pandas produced implicitly and avoids the error newer pandas raises on non-numeric data.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="Greens")

<AxesSubplot:>


# Keep a handle on the complete frame (raw and derived columns). This is a second name
# for the same object, not a copy; df is rebound to a reduced frame further down.
df_all = df


# Columns left out of the Logistic Regression frame: the raw numeric columns (their
# log-transformed versions are kept), the loan ID, and CoapplicantIncomeLog, which is
# removed because it contains infinite values.
cols = ['ApplicantIncome', 'CoapplicantIncome', "LoanAmount", "Loan_Amount_Term", "Total_Income", 'Loan_ID', 'CoapplicantIncomeLog']
# `columns=` already selects the column axis, so the listed columns are removed entirely.
df = df.drop(columns=cols)
df


# Select the attributes to which no logarithm has been applied.
# .copy() makes df_nonlog an independent frame, so the column assignments made to it
# later do not operate on a slice of df_all (which triggers SettingWithCopyWarning).
df_nonlog = df_all[['Gender','Married','Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', "LoanAmount", "Loan_Amount_Term", "Credit_History", "Property_Area", "Loan_Status", "Total_Income"]].copy()
df_nonlog


# Categorical columns (including the target) that are replaced by integer codes.
cols = ['Gender', "Married","Dependents", "Education",'Self_Employed',"Property_Area","Loan_Status"]
# One LabelEncoder instance is reused; it is refitted on every column it encodes.
le = LabelEncoder()

# df feeds Logistic Regression; df_nonlog feeds Decision Tree and Random Forest.
for frame in (df, df_nonlog):
    for col in cols:
        frame[col] = le.fit_transform(frame[col])


# Preview the label-encoded frame used for Logistic Regression.
df.head()


# (rows, columns) of the Logistic Regression frame.
df.shape

(614, 12)


# Preview the label-encoded frame used for Decision Tree and Random Forest.
df_nonlog.head()


# (rows, columns) of the Decision Tree / Random Forest frame.
df_nonlog.shape

(614, 12)


# Logistic Regression
# Inputs: every column of df except the target; output: the Loan_Status column.
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])


# Decision Tree and Random Forest
# Inputs: every column of df_nonlog except the target; output: the Loan_Status column.
y_nonlog = df_nonlog['Loan_Status']
X_nonlog = df_nonlog.drop(columns=['Loan_Status'])


def Evaluate_train_test_data(model, X, y):
    """Fit ``model`` on a 90/10 train/test split and score it two ways.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``score``; it is fitted in place on the
        training part of the split.
    X, y : features and target handed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(test score in percent, mean 9-fold cross-validation score on the
        training split in percent, x_train, x_test, y_train, y_test)``.
        The scores come from the estimator's own ``score`` / default scorer.
    """
    # Fixed random_state so every model is evaluated on the same split.
    split = train_test_split(X, y, test_size=0.10, random_state=2)
    x_train, x_test, y_train, y_test = split

    model.fit(x_train, y_train)

    # Nine folds are used so the cross-validation figure can be compared with the
    # score on the test data, which is only 10% of the rows.
    fold_scores = cross_val_score(model, x_train, y_train, cv=9)

    return (
        model.score(x_test, y_test) * 100,
        np.mean(fold_scores) * 100,
        x_train,
        x_test,
        y_train,
        y_test,
    )


def ROC_curve(model, x_train, x_test, y_train, y_test, plot):
    """Compute the ROC AUC of a fitted model on the test split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    x_train, y_train : accepted for signature compatibility; not used.
    x_test, y_test : test features and true labels.
    plot : when truthy, the ROC curve points are computed and returned as well.

    Returns
    -------
    tuple
        ``(ROC_auc, result)`` where ``result`` is an empty tuple when ``plot`` is
        falsy, otherwise ``(model class name, fpr, tpr, ROC_auc)``.
    """
    # Keep the second column of predict_proba, i.e. the probabilities of the
    # positive outcome.
    y_probs = model.predict_proba(x_test)[:, 1]

    ROC_auc = roc_auc_score(y_test, y_probs)

    ROC_best_parameters_result = ()

    if plot:
        # The curve points are only needed when the caller wants to draw the curve.
        ROC_fpr, ROC_tpr, _ = roc_curve(y_test, y_probs)
        ROC_best_parameters_result = (model.__class__.__name__, ROC_fpr, ROC_tpr, ROC_auc)

    return ROC_auc, ROC_best_parameters_result


def Precision_Recall_curve(model, x_train, x_test, y_train, y_test):
    """Compute precision/recall based scores of a fitted model on the test split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba`` and ``predict``.
    x_train, y_train : accepted for signature compatibility; not used.
    x_test, y_test : test features and true labels.

    Returns
    -------
    tuple
        ``(F1 score, area under the precision-recall curve, precision, recall)``.
        F1, precision and recall are computed from the predicted classes; the
        area is computed from the predicted positive-outcome probabilities.
    """
    # Probabilities of the positive outcome (second column of predict_proba).
    y_probs = model.predict_proba(x_test)[:, 1]
    # Predicted class values.
    y_test_pred = model.predict(x_test)

    pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_probs)

    pr_f1 = f1_score(y_test, y_test_pred)
    # Area under the curve with recall on the x axis and precision on the y axis.
    pr_auc = auc(pr_recall, pr_precision)

    pr_score_of_precision = precision_score(y_test, y_test_pred)
    pr_score_of_recall = recall_score(y_test, y_test_pred)

    return pr_f1, pr_auc, pr_score_of_precision, pr_score_of_recall


# Hyperparameters
C = [100, 10, 1.0, 0.1, 0.01]
penalty = ['none', 'l1', 'l2', 'elasticnet']
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Full grid: every (C, penalty, solver) combination.
LR_params = [C, penalty, solver]
LR_params = list(itertools.product(*LR_params))

column_params = ["C", "penalty", "solver"]
Accuracy_params = ["Test Data Accuracy", "Cross Validation (Training) Data Accuracy"]
ROC_params = ["ROC_AUC"]
Precision_recall_params = ["PR_F1", 'PR_AUC', "PR_Precision_score", "PR_Recall_score"]
All_scores_params = Accuracy_params+ROC_params+Precision_recall_params+column_params

# Only the AUC value is needed during the search, not the ROC curve points.
plot = False

# Rows are collected in plain lists and converted to DataFrames once after the loop:
# DataFrame.append no longer exists in pandas 2.0 and copied the frame on every call.
LR_Accuracy_rows = []
LR_ValueError_rows = []

for LR_combination in LR_params:
    C_value, penalty_value, solver_value = LR_combination
    try:
        # random_state is used when solver == 'sag', 'saga' or 'liblinear' to shuffle the data.
        model = LogisticRegression(C=C_value,
                                   penalty=penalty_value,
                                   solver=solver_value,
                                   random_state = 2)

        test_data_accuracy, cross_validation_training_data_accuracy, x_train, x_test, y_train, y_test = Evaluate_train_test_data(model, X, y)

        # Call ROC_curve
        roc_auc, _ = ROC_curve(model, x_train, x_test, y_train, y_test, plot)

        # Call Precision_Recall_curve
        pr_f1, pr_auc, pr_score_of_precision, pr_score_of_recall = Precision_Recall_curve(model, x_train, x_test, y_train, y_test)

    except ValueError:
        # A combination rejected with ValueError is recorded and skipped. Other
        # exceptions (and KeyboardInterrupt) are deliberately not swallowed.
        LR_ValueError_rows.append(list(LR_combination))

    else:
        LR_Accuracy_rows.append([test_data_accuracy,
                                 cross_validation_training_data_accuracy,
                                 roc_auc,
                                 pr_f1,
                                 pr_auc,
                                 pr_score_of_precision,
                                 pr_score_of_recall,
                                 C_value,
                                 penalty_value,
                                 solver_value
                                ])

# One row per rejected combination / per successfully evaluated combination.
LR_ValueError = pd.DataFrame(LR_ValueError_rows, columns = column_params)
LR_Accuracy = pd.DataFrame(LR_Accuracy_rows, columns = All_scores_params)


# Hyperparameters
criterion = ['gini', 'entropy']
max_depth = list(range(1,5))
min_samples_split = list(range(2,4))
min_samples_leaf = list(range(1,5))

# Full grid: every (criterion, max_depth, min_samples_split, min_samples_leaf) combination.
DT_params = [criterion, max_depth, min_samples_split, min_samples_leaf]
DT_params = list(itertools.product(*DT_params))

column_params = ["criterion", "max_depth", "min_samples_split", "min_samples_leaf"]
Accuracy_params = ["Test Data Accuracy", "Cross Validation (Training) Data Accuracy"]
ROC_params = ["ROC_AUC"]
Precision_recall_params = ["PR_F1", 'PR_AUC', "PR_Precision_score", "PR_Recall_score"]
All_scores_params = Accuracy_params+ROC_params+Precision_recall_params+column_params

# Only the AUC value is needed during the search, not the ROC curve points.
plot = False

# Rows are collected in plain lists and converted to DataFrames once after the loop:
# DataFrame.append no longer exists in pandas 2.0 and copied the frame on every call.
DT_Accuracy_rows = []
DT_ValueError_rows = []

for DT_combination in DT_params:
    criterion_value, max_depth_value, min_samples_split_value, min_samples_leaf_value = DT_combination
    try:
        # Set a random_state in order to get consistent results.
        model = DecisionTreeClassifier(criterion=criterion_value,
                                       max_depth=max_depth_value,
                                       min_samples_split=min_samples_split_value,
                                       min_samples_leaf=min_samples_leaf_value,
                                       random_state = 2)

        test_data_accuracy, cross_validation_training_data_accuracy, x_train, x_test, y_train, y_test = Evaluate_train_test_data(model, X_nonlog, y_nonlog)

        # Call ROC_curve
        roc_auc, _ = ROC_curve(model, x_train, x_test, y_train, y_test, plot)

        # Call Precision_Recall_curve
        pr_f1, pr_auc, pr_score_of_precision, pr_score_of_recall = Precision_Recall_curve(model, x_train, x_test, y_train, y_test)

    except ValueError:
        # A combination rejected with ValueError is recorded and skipped. Other
        # exceptions (and KeyboardInterrupt) are deliberately not swallowed.
        DT_ValueError_rows.append(list(DT_combination))

    else:
        DT_Accuracy_rows.append([test_data_accuracy,
                                 cross_validation_training_data_accuracy,
                                 roc_auc,
                                 pr_f1,
                                 pr_auc,
                                 pr_score_of_precision,
                                 pr_score_of_recall,
                                 criterion_value,
                                 max_depth_value,
                                 min_samples_split_value,
                                 min_samples_leaf_value
                                ])

# One row per rejected combination / per successfully evaluated combination.
DT_ValueError = pd.DataFrame(DT_ValueError_rows, columns = column_params)
DT_Accuracy = pd.DataFrame(DT_Accuracy_rows, columns = All_scores_params)


# Hyperparameters
n_estimators = [100, 200, 500]
criterion = ['gini', 'entropy']
# NOTE(review): 'auto' is not accepted by every scikit-learn release; where it is
# rejected with a ValueError the combination ends up in RF_ValueError — confirm
# against the installed version.
max_features = ['auto','sqrt','log2']
max_depth = [5]
min_samples_split = [30, 60]    # [0.1*len(x_train)]
min_samples_leaf = [10, 20]  # do not overfit the model

# Full grid: every combination of the six hyperparameter lists above.
RF_params = [n_estimators, criterion, max_features, max_depth, min_samples_split, min_samples_leaf]
RF_params = list(itertools.product(*RF_params))

column_params = ["n_estimators", "criterion", "max_features", "max_depth", "min_samples_split", "min_samples_leaf"]
Accuracy_params = ["Test Data Accuracy", "Cross Validation (Training) Data Accuracy"]
ROC_params = ["ROC_AUC"]
Precision_recall_params = ["PR_F1", 'PR_AUC', "PR_Precision_score", "PR_Recall_score"]
All_scores_params = Accuracy_params+ROC_params+Precision_recall_params+column_params

# Only the AUC value is needed during the search, not the ROC curve points.
plot = False

# Rows are collected in plain lists and converted to DataFrames once after the loop:
# DataFrame.append no longer exists in pandas 2.0 and copied the frame on every call.
RF_Accuracy_rows = []
RF_ValueError_rows = []

for RF_combination in RF_params:
    (n_estimators_value,
     criterion_value,
     max_features_value,
     max_depth_value,
     min_samples_split_value,
     min_samples_leaf_value) = RF_combination
    try:
        model = RandomForestClassifier(n_estimators=n_estimators_value,
                                       criterion=criterion_value,
                                       max_features=max_features_value,
                                       max_depth=max_depth_value,
                                       min_samples_split=min_samples_split_value,
                                       min_samples_leaf=min_samples_leaf_value,
                                       random_state = 2)

        test_data_accuracy, cross_validation_training_data_accuracy, x_train, x_test, y_train, y_test = Evaluate_train_test_data(model, X_nonlog, y_nonlog)

        # Call ROC_curve
        roc_auc, _ = ROC_curve(model, x_train, x_test, y_train, y_test, plot)

        # Call Precision_Recall_curve
        pr_f1, pr_auc, pr_score_of_precision, pr_score_of_recall = Precision_Recall_curve(model, x_train, x_test, y_train, y_test)

    except ValueError:
        # A combination rejected with ValueError is recorded and skipped. Other
        # exceptions (and KeyboardInterrupt) are deliberately not swallowed.
        RF_ValueError_rows.append(list(RF_combination))

    else:
        RF_Accuracy_rows.append([test_data_accuracy,
                                 cross_validation_training_data_accuracy,
                                 roc_auc,
                                 pr_f1,
                                 pr_auc,
                                 pr_score_of_precision,
                                 pr_score_of_recall,
                                 n_estimators_value,
                                 criterion_value,
                                 max_features_value,
                                 max_depth_value,
                                 min_samples_split_value,
                                 min_samples_leaf_value
                                ])

# One row per rejected combination / per successfully evaluated combination.
RF_ValueError = pd.DataFrame(RF_ValueError_rows, columns = column_params)
RF_Accuracy = pd.DataFrame(RF_Accuracy_rows, columns = All_scores_params)


# Logistic Regression
# Row of the grid-search table with the highest ROC_AUC.
LR_Best_Parameters = LR_Accuracy.loc[LR_Accuracy['ROC_AUC'].idxmax()]

# Rebuild the model from the hyperparameters stored in that row and evaluate it again.
LR_best_kwargs = LR_Best_Parameters[['C', 'penalty', 'solver']].to_dict()
LR_Best_model = LogisticRegression(random_state=2, **LR_best_kwargs)
(
    LR_Best_test_data_accuracy,
    LR_Best_cross_validation_training_data_accuracy,
    LR_Best_x_train,
    LR_Best_x_test,
    LR_Best_y_train,
    LR_Best_y_test,
) = Evaluate_train_test_data(LR_Best_model, X, y)

# With plot set to True, ROC_curve also returns the curve points needed for plotting.
plot = True
_, LR_best_result = ROC_curve(
    LR_Best_model, LR_Best_x_train, LR_Best_x_test, LR_Best_y_train, LR_Best_y_test, plot
)
LR_Best_Parameters

Test Data Accuracy                             82.2581
Cross Validation (Training) Data Accuracy      80.6246
ROC_AUC                                       0.739583
PR_F1                                         0.893204
PR_AUC                                         0.89425
PR_Precision_score                            0.836364
PR_Recall_score                               0.958333
C                                                    1
penalty                                             l1
solver                                       liblinear
Name: 26, dtype: object


# Decision Tree
# Row of the grid-search table with the highest ROC_AUC.
DT_Best_Parameters = DT_Accuracy.loc[DT_Accuracy['ROC_AUC'].idxmax()]

# Rebuild the model from the hyperparameters stored in that row and evaluate it again.
DT_best_kwargs = DT_Best_Parameters[
    ['criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf']
].to_dict()
DT_Best_model = DecisionTreeClassifier(random_state=2, **DT_best_kwargs)
(
    DT_Best_test_data_accuracy,
    DT_Best_cross_validation_training_data_accuracy,
    DT_Best_x_train,
    DT_Best_x_test,
    DT_Best_y_train,
    DT_Best_y_test,
) = Evaluate_train_test_data(DT_Best_model, X_nonlog, y_nonlog)

# With plot set to True, ROC_curve also returns the curve points needed for plotting.
plot = True
_, DT_best_result = ROC_curve(
    DT_Best_model, DT_Best_x_train, DT_Best_x_test, DT_Best_y_train, DT_Best_y_test, plot
)
DT_Best_Parameters

Test Data Accuracy                             83.871
Cross Validation (Training) Data Accuracy     80.2691
ROC_AUC                                      0.747024
PR_F1                                        0.901961
PR_AUC                                       0.918675
PR_Precision_score                           0.851852
PR_Recall_score                              0.958333
criterion                                     entropy
max_depth                                           4
min_samples_split                                   2
min_samples_leaf                                    1
Name: 56, dtype: object


# Random Forest
# Row of the grid-search table with the highest ROC_AUC.
RF_Best_Parameters = RF_Accuracy.loc[RF_Accuracy['ROC_AUC'].idxmax()]

# Rebuild the model from the hyperparameters stored in that row and evaluate it again.
RF_best_kwargs = RF_Best_Parameters[
    ['n_estimators', 'criterion', 'max_features', 'max_depth', 'min_samples_split', 'min_samples_leaf']
].to_dict()
RF_Best_model = RandomForestClassifier(random_state=2, **RF_best_kwargs)
(
    RF_Best_test_data_accuracy,
    RF_Best_cross_validation_training_data_accuracy,
    RF_Best_x_train,
    RF_Best_x_test,
    RF_Best_y_train,
    RF_Best_y_test,
) = Evaluate_train_test_data(RF_Best_model, X_nonlog, y_nonlog)

# With plot set to True, ROC_curve also returns the curve points needed for plotting.
plot = True
_, RF_best_result = ROC_curve(
    RF_Best_model, RF_Best_x_train, RF_Best_x_test, RF_Best_y_train, RF_Best_y_test, plot
)
RF_Best_Parameters

Test Data Accuracy                             83.871
Cross Validation (Training) Data Accuracy     80.6246
ROC_AUC                                      0.755952
PR_F1                                        0.903846
PR_AUC                                       0.904221
PR_Precision_score                           0.839286
PR_Recall_score                              0.979167
n_estimators                                      100
criterion                                        gini
max_features                                     auto
max_depth                                           5
min_samples_split                                  30
min_samples_leaf                                   20
Name: 1, dtype: object


# Table with one row per best model: class name, false-positive rates, true-positive
# rates and ROC_AUC, used for plotting the ROC curves.
roc_records = [LR_best_result, DT_best_result, RF_best_result]
roc_columns = ["model", "fpr", "tpr", "ROC_AUC"]
best_result_table = pd.DataFrame(roc_records, columns = roc_columns)
best_result_table


fig = plt.figure(figsize=(8,6))

# One curve per row of best_result_table, labelled with the model name and its ROC_AUC.
for i, roc_row in best_result_table.iterrows():
    curve_label = "{}, ROC_AUC={:.3f}".format(roc_row['model'], roc_row['ROC_AUC'])
    plt.plot(roc_row['fpr'], roc_row['tpr'], label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0,1], [0,1], color='orange', linestyle='--')

# Same tick positions (0.0, 0.1, ..., 1.0) on both axes.
axis_ticks = np.arange(0.0, 1.1, step=0.1)

plt.xticks(axis_ticks)
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(axis_ticks)
plt.ylabel("True Positive Rate", fontsize=15)

plt.title("ROC Curves", fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()


# Write every result table to its own sheet of results.xlsx.
with pd.ExcelWriter('results.xlsx') as writer:
    # Grid-search tables: the row index is a plain counter, so it is left out.
    for sheet_name, table in (('LR_ValueError', LR_ValueError),
                              ('LR_Accuracy', LR_Accuracy),
                              ('DT_ValueError', DT_ValueError),
                              ('DT_Accuracy', DT_Accuracy),
                              ('RF_ValueError', RF_ValueError),
                              ('RF_Accuracy', RF_Accuracy)):
        table.to_excel(writer, index = False, sheet_name=sheet_name)

    # Best-parameter rows are Series whose index holds the metric and hyperparameter
    # names; the index is written too, otherwise the sheet is an unlabelled column
    # of values.
    for sheet_name, best_row in (('LR_Best_Parameters', LR_Best_Parameters),
                                 ('DT_Best_Parameters', DT_Best_Parameters),
                                 ('RF_Best_Parameters', RF_Best_Parameters)):
        best_row.to_excel(writer, index = True, sheet_name=sheet_name)


# Logistic Regression Confusion Matrix (best parameter set)
# Predictions of the best model on its test split, compared with the true labels.
LR_Best_y_pred = LR_Best_model.predict(LR_Best_x_test)
cm = confusion_matrix(y_true=LR_Best_y_test, y_pred=LR_Best_y_pred)
print(cm)

# Annotated heatmap of the same matrix.
sns.heatmap(cm, cmap="Greens", annot=True)

[[ 5  9]
 [ 2 46]]

<AxesSubplot:>


# Decision Tree Confusion Matrix (best parameter set)
# Predictions of the best model on its test split, compared with the true labels.
DT_Best_y_pred = DT_Best_model.predict(DT_Best_x_test)
cm = confusion_matrix(y_true=DT_Best_y_test, y_pred=DT_Best_y_pred)
print(cm)

# Annotated heatmap of the same matrix.
sns.heatmap(cm, cmap="Greens", annot=True)

[[ 6  8]
 [ 2 46]]

<AxesSubplot:>


# Random Forest Confusion Matrix (best parameter set)
# Predictions of the best model on its test split, compared with the true labels.
RF_Best_y_pred = RF_Best_model.predict(RF_Best_x_test)
cm = confusion_matrix(y_true=RF_Best_y_test, y_pred=RF_Best_y_pred)
print(cm)

# Annotated heatmap of the same matrix.
sns.heatmap(cm, cmap="Greens", annot=True)

[[ 5  9]
 [ 1 47]]

<AxesSubplot:>

Variable	Description
Loan_ID	Unique Loan ID
Gender	Male/ Female
Married	Applicant married (Y/N)
Dependents	Number of dependents
Education	Applicant Education (Graduate/ Under Graduate)
Self_Employed	Self employed (Y/N)
ApplicantIncome	Applicant income
CoapplicantIncome	Coapplicant income
LoanAmount	Loan amount in thousands
Loan_Amount_Term	Term of loan in months
Credit_History	credit history meets guidelines
Property_Area	Urban/ Semi Urban/ Rural
Loan_Status	Loan approved (Y/N)

	Loan_ID	Gender	Married	Dependents	Education	Self_Employed	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History	Property_Area	Loan_Status
0	LP001002	Male	No	0	Graduate	No	5849	0.0	NaN	360.0	1.0	Urban	Y
1	LP001003	Male	Yes	1	Graduate	No	4583	1508.0	128.0	360.0	1.0	Rural	N
2	LP001005	Male	Yes	0	Graduate	Yes	3000	0.0	66.0	360.0	1.0	Urban	Y
3	LP001006	Male	Yes	0	Not Graduate	No	2583	2358.0	120.0	360.0	1.0	Urban	Y
4	LP001008	Male	No	0	Graduate	No	6000	0.0	141.0	360.0	1.0	Urban	Y

	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History
count	614.000000	614.000000	592.000000	600.00000	564.000000
mean	5403.459283	1621.245798	146.412162	342.00000	0.842199
std	6109.041673	2926.248369	85.587325	65.12041	0.364878
min	150.000000	0.000000	9.000000	12.00000	0.000000
25%	2877.500000	0.000000	100.000000	360.00000	1.000000
50%	3812.500000	1188.500000	128.000000	360.00000	1.000000
75%	5795.000000	2297.250000	168.000000	360.00000	1.000000
max	81000.000000	41667.000000	700.000000	480.00000	1.000000

	Loan_ID	Gender	Married	Dependents	Education	Self_Employed	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History	Property_Area	Loan_Status	Total_Income
0	LP001002	Male	No	0	Graduate	No	5849	0.0	146.412162	360.0	1.0	Urban	Y	5849.0
1	LP001003	Male	Yes	1	Graduate	No	4583	1508.0	128.000000	360.0	1.0	Rural	N	6091.0
2	LP001005	Male	Yes	0	Graduate	Yes	3000	0.0	66.000000	360.0	1.0	Urban	Y	3000.0
3	LP001006	Male	Yes	0	Not Graduate	No	2583	2358.0	120.000000	360.0	1.0	Urban	Y	4941.0
4	LP001008	Male	No	0	Graduate	No	6000	0.0	141.000000	360.0	1.0	Urban	Y	6000.0

	Loan_ID	Gender	Married	Dependents	Education	Self_Employed	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History	Property_Area	Loan_Status	Total_Income	ApplicantIncomeLog	CoapplicantIncomeLog	LoanAmountLog	Loan_Amount_Term_Log	Total_Income_Log
0	LP001002	Male	No	0	Graduate	No	5849	0.0	146.412162	360.0	1.0	Urban	Y	5849.0	8.674026	-inf	4.986426	5.886104	8.674026
1	LP001003	Male	Yes	1	Graduate	No	4583	1508.0	128.000000	360.0	1.0	Rural	N	6091.0	8.430109	7.318540	4.852030	5.886104	8.714568
2	LP001005	Male	Yes	0	Graduate	Yes	3000	0.0	66.000000	360.0	1.0	Urban	Y	3000.0	8.006368	-inf	4.189655	5.886104	8.006368
3	LP001006	Male	Yes	0	Not Graduate	No	2583	2358.0	120.000000	360.0	1.0	Urban	Y	4941.0	7.856707	7.765569	4.787492	5.886104	8.505323
4	LP001008	Male	No	0	Graduate	No	6000	0.0	141.000000	360.0	1.0	Urban	Y	6000.0	8.699515	-inf	4.948760	5.886104	8.699515

Ruth's Blog

Loan Eligibility Prediction

Dataset Information¶

Import Modules¶

Loading the Dataset¶

Preprocessing the Dataset¶

Exploratory Data Analysis¶

Creation of New Attributes¶

Log Transformation¶

Feature Scaling¶

Correlation Matrix¶

Attributes for Logistic Regression¶

Attributes for Decision Tree and Random Forest¶

Label Encoding¶

Preparation for Train-Test Split¶

Hyperparameter Tuning (Grid Search)¶

ROC Curve and AUC¶

Precision-Recall Curve, AUC, Precision, Recall and F1 Score¶

Logistic Regression¶

Decision Tree¶

Random Forest¶

Use ROC_AUC of test dataset as criterion for identifying the best parameter set for the relatively balanced dataset¶

Plot ROC Curves¶

Output as an Excel File¶

Confusion Matrix Using Best Parameter Set¶

	model	fpr	tpr	ROC_AUC
0	LogisticRegression	[0.0, 0.0, 0.0, 0.07142857142857142, 0.0714285...	[0.0, 0.020833333333333332, 0.2083333333333333...	0.739583
1	DecisionTreeClassifier	[0.0, 0.0, 0.0, 0.35714285714285715, 0.5714285...	[0.0, 0.020833333333333332, 0.2916666666666667...	0.747024
2	RandomForestClassifier	[0.0, 0.0, 0.0, 0.07142857142857142, 0.0714285...	[0.0, 0.020833333333333332, 0.125, 0.125, 0.39...	0.755952