Model Selection

Model selection is the process of choosing one final machine learning model from a set of candidate models trained on the same dataset.

It is a crucial step in any machine learning project, since the choice of model can significantly affect both the accuracy and the efficiency of the final solution.


Model selection using cross-validation

  1. Importing dependencies

     import numpy as np
     import pandas as pd
     from sklearn.model_selection import cross_val_score
     from sklearn.model_selection import GridSearchCV
     import warnings
     warnings.simplefilter('ignore')
    
  2. Importing models from scikit-learn

     from sklearn.linear_model import LogisticRegression
     from sklearn.svm import SVC
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.ensemble import RandomForestClassifier
    
  3. Loading dataset

     df = pd.read_csv('heart.csv')
     df.head()
    
  4. Separating features and target

     # Separate the feature matrix and the target column
     X = df.drop(columns='target')
     y = df['target']

     # Convert to NumPy arrays for scikit-learn
     X = np.asarray(X)
     y = np.asarray(y)
    
  5. Comparing models

     # List of candidate models
     models = [
         LogisticRegression(max_iter=1000),
         SVC(kernel='linear'),
         KNeighborsClassifier(),
         RandomForestClassifier(random_state=0)
     ]
    
     # Function for comparing models with 5-fold cross-validation
     def compare_models_cv():
         for model in models:
             cv_score = cross_val_score(model, X, y, cv=5)
             mean_score = round(cv_score.mean() * 100, 2)
             print(f'Cross-validation scores for {model}: {cv_score}')
             print(f'Mean cross-validation accuracy of {model} = {mean_score}%')
             print('-' * 100)
    
     # Calling function
     compare_models_cv()
    

    Output: the per-fold cross-validation scores and the mean accuracy printed for each of the four models.
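
     For intuition, cross_val_score with cv=5 on a classifier is roughly equivalent to splitting the data with StratifiedKFold and averaging the per-fold accuracies by hand. The following is a minimal sketch of that equivalent, reusing the X and y arrays from step 4 (the explicit loop and the fold_scores name are only illustrative, not part of the original code):

     from sklearn.model_selection import StratifiedKFold
     from sklearn.metrics import accuracy_score

     # Manual 5-fold loop, roughly what cross_val_score(model, X, y, cv=5) does for a classifier
     skf = StratifiedKFold(n_splits=5)
     model = LogisticRegression(max_iter=1000)

     fold_scores = []
     for train_idx, test_idx in skf.split(X, y):
         model.fit(X[train_idx], y[train_idx])
         preds = model.predict(X[test_idx])
         fold_scores.append(accuracy_score(y[test_idx], preds))

     print(f'Mean accuracy: {round(np.mean(fold_scores) * 100, 2)}%')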

  6. Comparing models with hyperparameter tuning

     # List of models (hyperparameters to be tuned are left unset here)
     models = [
         LogisticRegression(max_iter=10000),
         SVC(),
         KNeighborsClassifier(),
         RandomForestClassifier(random_state=0)
     ]
    
     # Dictionary of hyperparameter grids, one entry per model (same order as the models list)
     parameter = {
         'log_reg_hyperparameters': { 
             'C' : [1,5,10,20]
         },
         'svc_hyperparameters': {
             'kernel' : ['linear','poly','rbf','sigmoid'],
             'C' : [1,5,10,20]
         },
         'KNN_hyperparameters' : {   
             'n_neighbors' : [3,5,10]
         },
         'random_forest_hyperparameters' : {  
             'n_estimators' : [10, 20, 50, 100]
         }
     }
    
     # List of hyperparameter-grid keys, in the same order as the models list
     model_keys = list(parameter)
    
     # Function for comparing models with hyperparameter tuning
     result = []
     def model_selection(model_list, hyperparameters):
         for model, key in zip(model_list, model_keys):
             params = hyperparameters[key]
             clf = GridSearchCV(model, params, cv=5)
             clf.fit(X, y)
             result.append({
                 'Model used': model,
                 'Highest score': clf.best_score_,
                 'Best parameters': clf.best_params_
             })

     # Calling function
     model_selection(models, parameter)
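
     GridSearchCV refits the best parameter combination on the full dataset by default (refit=True), so the tuned estimator itself can be recovered, not just its score. A short illustrative follow-up for a single model, assuming the same X, y and parameter dictionary as above (the rf_grid and best_rf names are just for illustration):

     # Re-run the search for one model and keep the refit best estimator
     rf_grid = GridSearchCV(RandomForestClassifier(random_state=0),
                            parameter['random_forest_hyperparameters'], cv=5)
     rf_grid.fit(X, y)

     best_rf = rf_grid.best_estimator_   # already refit on all of X, y
     print(rf_grid.best_params_, round(rf_grid.best_score_ * 100, 2))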
    
  7. Creating a DataFrame of the results

     pd.DataFrame(result)
    

    |   | Model used                              | Highest score | Best parameters              |
    | --- | --- | --- | --- |
    | 0 | LogisticRegression(max_iter=10000)      | 0.834863      | {'C': 5}                     |
    | 1 | SVC()                                   | 0.828306      | {'C': 1, 'kernel': 'linear'} |
    | 2 | KNeighborsClassifier()                  | 0.643880      | {'n_neighbors': 5}           |
    | 3 | RandomForestClassifier(random_state=0)  | 0.838087      | {'n_estimators': 100}        |
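
     To pick the final model programmatically rather than by reading the table, the same result list can be sorted by score. A small sketch (the results_df name is just for illustration):

     results_df = pd.DataFrame(result)

     # Sort candidates by their best cross-validated score, highest first
     results_df = results_df.sort_values('Highest score', ascending=False).reset_index(drop=True)
     print(results_df.loc[0, 'Model used'], results_df.loc[0, 'Best parameters'])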