Model selection is the process of selecting one final machine learning model from among a collection of candidate machine learning models for a training dataset.
Model selection is a crucial step in any machine learning project, because the chosen model can significantly affect the project's accuracy and efficiency.
Model selection using cross-validation
Importing dependencies
import numpy as np import pandas as pd from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV import warnings warnings.simplefilter('ignore')
Importing models from scikit-learn
from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier
Loading dataset
# Load the dataset from 'heart.csv' in the current working directory.
df = pd.read_csv('heart.csv')

# Preview the first five rows (5 is the default for DataFrame.head).
df.head(n=5)
Segregating features
# Feature matrix: every column except the 'target' label column, as a NumPy array.
# (`columns=` already selects the axis, so no separate `axis` argument is needed.)
X = np.asarray(df.drop(columns='target'))

# Label vector: the 'target' column, as a NumPy array.
y = np.asarray(df['target'])
Comparing models
# Candidate classifiers to compare.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]


def compare_models_cv():
    """Print the 5-fold cross-validation scores of every entry in ``models``.

    For each estimator this prints the per-fold scores returned by
    ``cross_val_score`` on the module-level ``X`` and ``y``, then the mean
    score as a percentage rounded to two decimals, then a 100-character
    dashed separator line.
    """
    for model in models:
        cv_score = cross_val_score(model, X, y, cv=5)
        mean_score = round(cv_score.mean() * 100, 2)
        # One print call with a newline separator emits the same three lines
        # as three consecutive print calls would.
        print(
            f'Cross Validation accuracy score for the model {model} = {cv_score}',
            f'Accuracy score of the model {model} = {mean_score}%',
            '-' * 100,
            sep='\n',
        )


# Run the comparison.
compare_models_cv()
output:
Comparing the models with hyperparameter tuning
# Candidate classifiers; the tuned hyperparameters are supplied by the grid search.
models = [
    LogisticRegression(max_iter=10000),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

# Hyperparameter grids, listed in the same order as `models`.
parameter = {
    'log_reg_hyperparameters': {
        'C': [1, 5, 10, 20],
    },
    'svc_hyperparameters': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20],
    },
    'KNN_hyperparameters': {
        'n_neighbors': [3, 5, 10],
    },
    'random_forest_hyperparameters': {
        'n_estimators': [10, 20, 50, 100],
    },
}

# Names of the grids, in the same order as `models`.
model_keys = list(parameter)

# One summary dict per tuned model is appended here by modelSelection().
result = []


def modelSelection(list_model, hyperparameters):
    """Grid-search every model and record its best score and parameters.

    Each model in ``list_model`` is paired, by position, with the grid at the
    same position in ``hyperparameters``. A 5-fold ``GridSearchCV`` is fitted
    on the module-level ``X`` and ``y`` for each pair, and a summary dict is
    appended to the module-level ``result`` list.

    Args:
        list_model: Sequence of estimators to tune.
        hyperparameters: Dict mapping a grid name to a parameter grid. Its
            insertion order must match the order of ``list_model``.

    Raises:
        ValueError: If the number of models and grids differ.

    Note:
        ``result`` is not cleared first, so calling this function again
        appends further rows to the existing ones.
    """
    if len(list_model) != len(hyperparameters):
        raise ValueError(
            f'Expected one hyperparameter grid per model, got '
            f'{len(list_model)} models and {len(hyperparameters)} grids'
        )

    # Use the arguments that were passed in (not the module-level globals) and
    # pair them positionally instead of maintaining a manual index counter.
    for model, params in zip(list_model, hyperparameters.values()):
        clf = GridSearchCV(model, params, cv=5)
        clf.fit(X, y)
        result.append({
            'Model used': model,
            'Highest score': clf.best_score_,
            'Best parameters': clf.best_params_,
        })


# Tune all candidate models.
modelSelection(models, parameter)
Creating data frame of results
# Tabulate the collected grid-search summaries, one row per entry in `result`.
pd.DataFrame(data=result)
|   | Model used | Highest score | Best parameters |
| --- | --- | --- | --- |
| 0 | LogisticRegression(max_iter=10000) | 0.834863 | {'C': 5} |
| 1 | SVC() | 0.828306 | {'C': 1, 'kernel': 'linear'} |
| 2 | KNeighborsClassifier() | 0.643880 | {'n_neighbors': 5} |
| 3 | RandomForestClassifier(random_state=0) | 0.838087 | {'n_estimators': 100} |