Model Selection

Model selection is the process of selecting one final machine learning model from among a collection of candidate machine learning models for a training dataset.

Model selection is a crucial step in any machine learning project, because the chosen model can significantly affect the project's accuracy and efficiency.

Model selection using cross-validation

Importing dependencies

 import numpy as np
 import pandas as pd
 from sklearn.model_selection import cross_val_score
 from sklearn.model_selection import GridSearchCV
 import warnings
 warnings.simplefilter('ignore')

Importing models from scikit-learn

 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.ensemble import RandomForestClassifier

Loading dataset

 # Load 'heart.csv' (path is relative to the working directory) into a DataFrame
 df = pd.read_csv('heart.csv')
 # Preview the first few rows; shown when this is the last expression of a notebook cell
 df.head()

Segregating features

 # Feature matrix: every column except 'target'; label vector: the 'target'
 # column. Both are converted to NumPy arrays in a single step.
 # (`axis` is not needed: pandas ignores it when `columns=` is given.)
 X = np.asarray(df.drop(columns='target'))
 y = np.asarray(df['target'])

Comparing models

 # Candidate classifiers to compare, each with fixed (untuned) settings
 models = [
     LogisticRegression(max_iter=1000),
     SVC(kernel='linear'),
     KNeighborsClassifier(),
     RandomForestClassifier(random_state=0),
 ]

 # Cross-validated comparison of the candidate models
 def compare_models_cv():
     """Print 5-fold cross-validation scores for every estimator in ``models``.

     Reads the module-level ``models``, ``X`` and ``y``. For each estimator it
     prints the per-fold scores, the mean score as a percentage rounded to two
     decimals, and a 100-character separator line. Returns nothing.
     """
     divider = '-' * 100
     for model in models:
         cv_score = cross_val_score(model, X, y, cv=5)
         mean_score = round(100 * cv_score.mean(), 2)
         # One print call; sep='\n' puts each piece on its own line
         print(
             f'Cross Validation accuracy score for the model {model} = {cv_score}',
             f'Accuracy score of the model {model} = {mean_score}%',
             divider,
             sep='\n',
         )

 # Run the comparison
 compare_models_cv()

output:

Comparing the models with hyperparameter tuning

 # List of models to tune. This rebinds `models`; the hyperparameter values to
 # search for each model are given separately in `parameter` below.
 models = [
     LogisticRegression(max_iter=10000),
     SVC(),
     KNeighborsClassifier(),
     RandomForestClassifier(random_state=0)
 ]

 # Hyperparameter grids, one entry per model. The entries are matched to
 # `models` by position, so their order must follow the order of that list.
 # Each inner dict maps a parameter name to the list of values to try.
 parameter = {
     'log_reg_hyperparameters': { 
         'C' : [1,5,10,20]
     },
     'svc_hyperparameters': {
         'kernel' : ['linear','poly','rbf','sigmoid'],
         'C' : [1,5,10,20]
     },
     'KNN_hyperparameters' : {   
         'n_neighbors' : [3,5,10]
     },
     'random_forest_hyperparameters' : {  
         'n_estimators' : [10, 20, 50, 100]
     }
 }

 # List of the keys of `parameter` (in insertion order), so that the grid for
 # the i-th model can be looked up by position
 model_keys = list(parameter)        

 # Collects one summary dict per tuned model; read afterwards to build the
 # results table
 result = []
 def modelSelection(list_model, hyperparameters):
     """Grid-search every model and record its best score and parameters.

     Each model is paired by position with one parameter grid, tuned with a
     5-fold ``GridSearchCV`` fitted on the module-level ``X`` and ``y``, and
     summarised as a dict appended to the module-level ``result`` list.

     Parameters
     ----------
     list_model : sequence of estimators
         The models to tune.
     hyperparameters : dict
         Maps a label to a parameter grid. The i-th entry (in insertion
         order) is the grid used for the i-th model in ``list_model``.

     Raises
     ------
     ValueError
         If the number of models and the number of grids differ.
     """
     # Fail loudly instead of silently skipping models or grids
     if len(list_model) != len(hyperparameters):
         raise ValueError(
             f'Expected one hyperparameter grid per model, got '
             f'{len(list_model)} models and {len(hyperparameters)} grids'
         )

     # Use the arguments that were passed in rather than module globals,
     # pairing each model with its grid by position
     for model, params in zip(list_model, hyperparameters.values()):
         clf = GridSearchCV(model, params, cv=5)
         clf.fit(X, y)
         result.append({
             'Model used': model,
             'Highest score': clf.best_score_,
             'Best parameters': clf.best_params_
         })

 # calling function
 modelSelection(models, parameter)

Creating data frame of results
```
 # Tabulate the collected results: one row per dict in `result`
 pd.DataFrame(result)
```
| | Model used | Highest score | Best parameters |
| --- | --- | --- | --- |
| 0 | LogisticRegression(max_iter=10000) | 0.834863 | {'C': 5} |
| 1 | SVC() | 0.828306 | {'C': 1, 'kernel': 'linear'} |
| 2 | KNeighborsClassifier() | 0.643880 | {'n_neighbors': 5} |
| 3 | RandomForestClassifier(random_state=0) | 0.838087 | {'n_estimators': 100} |

https://github.com/Anirudhrarao/Machine-Learning-essential/blob/main/Modelselection.ipynb