EXPLORATORY ANALYSIS OF CLASSIFICATION MODELS WITH SKLEARN AND BANK MARKETING DATA

Abstract:

The data relates to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

Source:

Dataset from: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing#

Loading and getting ideas about the data

There are many categorical columns. Let's inspect the unique values of these columns and decide whether to use one-hot encoding or a different approach.

Since the columns containing "unknown" values are categorical, imputing them with a mean is not a good option. So "unknown" is kept as its own category, and all categorical columns are transformed with one-hot encoding. Now we have 64 features in total.
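A minimal sketch of this encoding step with pandas; the column names below are illustrative stand-ins, not the full bank dataset:

```python
import pandas as pd

# Toy frame standing in for the bank data: "unknown" is kept as its own
# category rather than imputed, so it simply becomes another dummy column.
df = pd.DataFrame({
    "job": ["admin.", "unknown", "technician"],
    "contact": ["cellular", "telephone", "unknown"],
})

# One dummy column per (column, category) pair.
encoded = pd.get_dummies(df)
print(encoded.columns.tolist())
```

Note that `job_unknown` and `contact_unknown` show up as ordinary features, exactly as described above.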

No categorical values and no NULL values remain. The target variable is converted to numerical form.
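The target conversion is a simple mapping (a sketch; the yes/no labels follow the dataset description above):

```python
import pandas as pd

# The target column "y" holds "yes"/"no"; map it to 1/0 for the classifiers.
y = pd.Series(["no", "yes", "no", "no"])
y_num = y.map({"yes": 1, "no": 0})
print(y_num.tolist())  # [0, 1, 0, 0]
```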

Correlation analysis

Top 10 most highly correlated columns with the target variable.

Duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target

Poutcome: outcome of the previous marketing campaign

Previous: number of contacts performed before this campaign and for this client (numeric)

Contact: contact communication type. Cellular seems to be more effective.

The rest are self-explanatory; some come from the one-hot encoded columns. Next, visualizing the full correlation matrix with seaborn.
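The ranking and the heatmap can be sketched as follows (on synthetic stand-in data, since the full encoded frame is not reproduced here):

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless backend; assumption: no display is needed
import matplotlib.pyplot as plt
import seaborn as sns

# Synthetic stand-in for the encoded frame; "y" is the numeric target.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 5)), columns=list("abcde"))
df["y"] = (df["a"] + 0.5 * df["b"] + rng.normal(size=200) > 0).astype(int)

# Strongest correlations with the target (absolute value, target excluded).
top = df.corr()["y"].drop("y").abs().sort_values(ascending=False).head(10)
print(top)

# Full correlation matrix as a seaborn heatmap.
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.tight_layout()
```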

Splitting target and predictor variables

Standardizing the predictor variables
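These two steps together look roughly like this (a sketch on a tiny stand-in frame; in the notebook `df` holds the 63 encoded predictors plus `y`):

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Tiny stand-in for the encoded bank frame.
df = pd.DataFrame({"duration": [100, 250, 30],
                   "previous": [0, 2, 1],
                   "y": [0, 1, 0]})

X = df.drop(columns="y")   # predictor variables
y = df["y"]                # target variable

# Standardize each predictor to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled.mean(axis=0).round(6))
```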

Sample Distribution

The class distribution is highly imbalanced. AUC can serve as the final metric, along with other feature engineering techniques.
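Checking the balance is a one-liner; the 88/12 split here is illustrative, not the exact dataset ratio:

```python
import pandas as pd

# In the bank data "no" dominates heavily, which is why plain accuracy
# would be misleading and AUC is preferred.
y = pd.Series(["no"] * 88 + ["yes"] * 12)
print(y.value_counts(normalize=True))
```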

AUC_ROC function

The ROC (Receiver Operating Characteristic) curve is a performance measurement for a classification problem at various thresholds. Standard accuracy labels an example as positive if the prediction is >50% confident and negative if it is <50% confident, whereas the ROC-AUC method considers all thresholds. It is calculated by plotting the true positive rate against the false positive rate. This method is particularly useful when the data is imbalanced or when we need to avoid certain biases in the training process. $$ \text{TPR} = \frac{TP}{TP+FN} $$ $$ \text{FPR} = \frac{FP}{TN+FP} $$ where TP and TN are true positives and true negatives, and FP and FN are false positives and false negatives.

The AUC (Area Under the Curve) measures the area under the TPR-FPR plot and ranges from 0 to 1. A value of 0 is the worst separability, 0.5 means no class-separation capacity, and 1 means the best class-separation capacity.
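In practice both quantities come straight from sklearn; a minimal worked example with toy labels and probabilities:

```python
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

# Toy labels and predicted probabilities; roc_curve sweeps the thresholds,
# roc_auc_score integrates the resulting TPR-FPR curve.
y_true = np.array([0, 0, 1, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = roc_curve(y_true, y_prob)
auc = roc_auc_score(y_true, y_prob)
print(auc)  # 0.75
```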

Train_test split
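A sketch of the split; the `stratify=y` argument (an assumption about the exact call used) keeps the minority-class proportion identical in both halves, which matters for data this imbalanced:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 8 + [1] * 2)

# stratify=y preserves the 8:2 class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=42
)
print(y_train.sum(), y_test.sum())
```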

Training models

Okay, we will begin by training initial models to understand the baseline performance before doing some feature engineering to make the models better.

KNN

For the first model we will begin with KNN, one of the most common and simplest supervised classification techniques. KNNs are usually very good for linearly separable data, but they work with non-linear data as well. KNN works by calculating the Euclidean distance from a sample to its k nearest data points and assigning the sample to the majority group among those neighbors.

Advantages: Robust to noise

Disadvantage: Slow, Finding k value is crucial
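A baseline KNN run might look like this (on synthetic imbalanced data standing in for the bank features; hyperparameters are illustrative):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# Synthetic imbalanced data standing in for the bank features.
X, y = make_classification(n_samples=500, n_features=10, weights=[0.88],
                           random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

knn = KNeighborsClassifier(n_neighbors=5)  # k is the crucial hyperparameter
knn.fit(X_tr, y_tr)
auc = roc_auc_score(y_te, knn.predict_proba(X_te)[:, 1])
print(round(auc, 3))
```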

Decision Tree classifier.

Decision trees are another supervised learning technique. It is a tree-structured classifier, where internal nodes represent the features of a dataset, branches represent the decision rules and each leaf node represents the outcome.

Disadvantage: prone to overfitting when the model becomes too complex, but this can be rectified with a Random Forest implementation.

Advantage: There is less requirement of data cleaning compared to other algorithms.
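The same baseline pattern with a decision tree; the `max_depth` cap (an illustrative choice) is the usual guard against the overfitting noted above:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, n_features=10, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=1)

# max_depth limits tree complexity to reduce overfitting.
tree = DecisionTreeClassifier(max_depth=5, random_state=1)
tree.fit(X_tr, y_tr)
auc = roc_auc_score(y_te, tree.predict_proba(X_te)[:, 1])
print(round(auc, 3))
```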

SVM

SVM is one of the most renowned machine learning models. It has achieved state-of-the-art results on several types of classification problems.
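A corresponding SVM baseline sketch; `probability=True` is needed here so that `predict_proba` is available for the AUC metric, at some extra training cost:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, n_features=10, random_state=2)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=2)

# probability=True enables predict_proba (needed for AUC).
svm = SVC(kernel="rbf", probability=True, random_state=2)
svm.fit(X_tr, y_tr)
auc = roc_auc_score(y_te, svm.predict_proba(X_te)[:, 1])
print(round(auc, 3))
```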

XGB classifier

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable. It implements machine learning algorithms under the gradient boosting framework.

Thoughts

The obtained AUC scores range from 0.65 to 0.75 for all the models. Let's see if we can improve by balancing the distribution through up-sampling or down-sampling.

Feature Engineering: Up-Sampling with SMOTE

Train Set

Test Set

Feature Engineering: Down-Sampling with NearMiss

Train Set

Test Set

BankMarket Class

Now that we have trained a few models, done some feature engineering, and understood how the results look, we can create a single object that can be used to train, test, and predict with simple user inputs. This is a rather simple OOP implementation and can easily be extended with GUI elements later if needed, but since this is a Kaggle challenge that is not implemented.

The object asks for user input of the type of model to be trained. After selecting, it will train and test automatically and save the model under the same name as the input string given by the user, which can later be used to predict samples by simply inputting the file path of the model. The object takes X_train, X_test, y_train, y_test for training and testing, and any array or dataframe of shape (1, 63) for predicting a class.
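A condensed sketch of what such a class might look like. The class name follows the description above, but the model registry, method names, and the use of `joblib` for persistence are assumptions, not the original implementation:

```python
import joblib
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

class BankMarket:
    """Train, evaluate, save, and reuse a named classifier (sketch)."""

    # Hypothetical registry mapping user input strings to model classes.
    MODELS = {"knn": KNeighborsClassifier, "tree": DecisionTreeClassifier}

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test

    def train(self, name):
        # Train the requested model, report test AUC, save it under `name`.
        model = self.MODELS[name]()
        model.fit(self.X_train, self.y_train)
        auc = roc_auc_score(self.y_test,
                            model.predict_proba(self.X_test)[:, 1])
        joblib.dump(model, f"{name}.joblib")
        return auc

    @staticmethod
    def predict(filepath, sample):
        # Load a previously saved model and predict one (1, n_features) sample.
        return joblib.load(filepath).predict(sample)

# Demo on synthetic data.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=5, random_state=6)
bm = BankMarket(*train_test_split(X, y, random_state=6))
auc = bm.train("knn")
label = BankMarket.predict("knn.joblib", X[:1])
print(round(auc, 3), label)
```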

Original non engineered data

The same results we saw previously.

Train with upsampled data

All models improved by a big margin; the up-sampling helped performance considerably.

Train with Down-sampled data

Significantly better as well, although the up-sampled version should be used for the final model.

Predicting new inputs to get a label class

Any new values can be input, and the model will output a label.

Great. The randomly generated 63 values were predicted as a positive subscription. We now have a classifier with an AUC of 0.89 for predicting whether a subject will be a subscriber or not.