Classification with PLS-DA#
Author: Nathan A. Mahynski
Date: 2023/08/23
Description: Illustrate modeling with PLS-DA.
[ ]:
if 'google.colab' in str(get_ipython()):
!pip install git+https://github.com/mahynski/pychemauth@main
import os
os.kill(os.getpid(), 9) # Automatically restart the runtime to reload libraries
[1]:
try:
import pychemauth
except:
raise ImportError("pychemauth not installed")
import matplotlib.pyplot as plt
%matplotlib inline
import watermark
%load_ext watermark
%load_ext autoreload
%autoreload 2
[2]:
import imblearn
import sklearn
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
[3]:
%watermark -t -m -v --iversions
Python implementation: CPython
Python version : 3.10.12
IPython version : 7.34.0
Compiler : GCC 11.4.0
OS : Linux
Release : 6.1.85+
Machine : x86_64
Processor : x86_64
CPU cores : 2
Architecture: 64bit
pychemauth: 0.0.0b4
sklearn : 1.3.0
matplotlib: 3.7.2
pandas : 1.5.3
watermark : 2.4.3
imblearn : 0.11.0
numpy : 1.24.3
Load the Data
[4]:
from sklearn.datasets import load_iris as load_data
X, y = load_data(return_X_y=True, as_frame=True)
[5]:
# Let's turn the indices into names
names = dict(zip(np.arange(3), ['setosa', 'versicolor', 'virginica']))
y = y.apply(lambda x: names[x])
[6]:
X.head(5)
[6]:
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
[7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X.values,
y.values,
shuffle=True,
random_state=42,
test_size=0.2,
stratify=y # It is usually important to balance the test and train set so they have the same fraction of classes
)
Fitting a PLS-DA Model#
[8]:
from pychemauth.classifier.plsda import PLSDA
Training a Hard Model
“Hard” PLS-DA models use hyperplanes to divide the latent (score) space into distinct regions, 1 and only 1 of which is associated with a class.
[9]:
hard_plsda = PLSDA(
n_components=3,
alpha=0.05,
gamma=0.01,
not_assigned='UNKNOWN', # This needs to match the same datatype as y (here, y is str); for integers, e.g., use -1
style="hard",
scale_x=True,
score_metric='TEFF' # Let's use TEFF as the model's scoring metric. This could be TSNS or TSPS instead.
)
[10]:
_ = hard_plsda.fit(X_train, y_train)
[11]:
# 3 class systems can be plotted in 2D, 2 class systems in 1D; more than 3 cannot be shown. `visualize` automatically
# selects the right version for your.
_ = hard_plsda.visualize(styles=['hard'])
[12]:
# We can see what samples are predicted to be using the predict() function.
pred = hard_plsda.predict(X_train)
[13]:
print(
'Test set TEFF = {}\nTrain set TEFF = {}'.format(
'%.3f'%hard_plsda.score(X_test, y_test),
'%.3f'%hard_plsda.score(X_train, y_train)
)
)
Test set TEFF = 0.733
Train set TEFF = 0.750
For Hard PLS-DA each point is predicted to belong to 1 and only 1 class, so TEFF = TSPS = TSNS
[14]:
# The score() function is just testing how many are correctly predicted. You can do this directly and
# easily with the "hard" version of PLS-DA.
fom = hard_plsda.figures_of_merit(pred, y_train)
print(fom['TSNS'], fom['TSPS'], fom['TEFF'])
0.75 0.75 0.75
[15]:
# This is also equivalent to the accuracy of the model
np.sum(np.array(pred).ravel() == y_train) / y_train.shape[0]
[15]:
0.75
[16]:
# Each row is what the sample ACTUALLY is, each column is what the PREDICTION is.
# Here, versicolor is predicted to be setosa 19 times.
fom['CM']
[16]:
| setosa | versicolor | virginica | UNKNOWN | |
|---|---|---|---|---|
| setosa | 39 | 1 | 0 | 0 |
| versicolor | 19 | 18 | 3 | 0 |
| virginica | 1 | 6 | 33 | 0 |
[17]:
# Total for each category
fom['I']
[17]:
setosa 40
versicolor 40
virginica 40
dtype: int64
[18]:
# Class sensitivity
fom['CSNS']
[18]:
setosa 0.975
versicolor 0.450
virginica 0.825
dtype: float64
[19]:
# Class specificity
fom['CSPS']
[19]:
setosa 0.7500
versicolor 0.9125
virginica 0.9625
dtype: float64
[20]:
# Class efficiency
fom['CEFF']
[20]:
setosa 0.855132
versicolor 0.640800
virginica 0.891102
dtype: float64
Training a Soft Model
“Soft” PLS-DA models use ellipses to divide the latent (score) space into possibly overlapping regions, meaning that a point can be assigned to 0, 1, or > 1 classes.
[21]:
# Here the data are elemental levels so we will scale the X data
soft_plsda = PLSDA(
n_components=3,
alpha=0.05,
gamma=0.01,
not_assigned='UNKNOWN',
style="soft",
scale_x=True,
score_metric='TEFF'
)
[22]:
_ = soft_plsda.fit(X_train, y_train)
[27]:
# You can visualize both the hard and soft boundaries if you train a soft model.
# With a hard model, you only get the hard boundaries by default.
_ = soft_plsda.visualize(styles=['hard', 'soft'])
[28]:
# We can see what samples are predicted to be using the predict() function.
train_pred = soft_plsda.predict(X_train)
[29]:
# As shown above, there are no outliers in the training data.
# Only the soft version of PLSDA can do this check.
soft_plsda.check_outliers().any()
[29]:
False
[30]:
print(
'Test set TEFF = {}\nTrain set TEFF = {}'.format(
'%.3f'%soft_plsda.score(X_test, y_test),
'%.3f'%soft_plsda.score(X_train, y_train)
)
)
Test set TEFF = 0.983
Train set TEFF = 0.975
[31]:
# Samples can now be predicted to belong to multiple classes.
train_pred[:10]
[31]:
[['setosa'],
['virginica'],
['versicolor'],
['setosa'],
['versicolor'],
['virginica'],
['versicolor'],
['versicolor', 'virginica'],
['virginica'],
['virginica']]
[32]:
# More complete figures of merit can be computed.
fom = soft_plsda.figures_of_merit(train_pred, y_train)
# For Soft PLS-DA TEFF, TSPS, and TSNS are generally different
print(fom['TSNS'], fom['TSPS'], fom['TEFF'])
0.9666666666666667 0.9833333333333333 0.9749643868139777
[33]:
fom['I']
[33]:
setosa 40
versicolor 40
virginica 40
dtype: int64
[34]:
# The soft model can predict that things are unknown and that they might belong to multiple classes.
# Observe that there of 40 versicolor samples, there are 40 assignments to versicolor and 4 to virginica (40 + 4 > 40),
# because some are assigned to both categories.
fom['CM']
[34]:
| setosa | versicolor | virginica | UNKNOWN | |
|---|---|---|---|---|
| setosa | 38 | 0 | 0 | 2 |
| versicolor | 0 | 39 | 3 | 0 |
| virginica | 0 | 1 | 39 | 1 |
[35]:
fom = soft_plsda.figures_of_merit(soft_plsda.predict(X_test), y_test)
fom['CM']
[35]:
| setosa | versicolor | virginica | UNKNOWN | |
|---|---|---|---|---|
| setosa | 10 | 0 | 0 | 0 |
| versicolor | 0 | 10 | 1 | 0 |
| virginica | 0 | 1 | 10 | 0 |
[38]:
# We can plot the test on the training set like this
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,6))
ax = soft_plsda.visualize(styles=['soft'], ax=ax, show_training=False)
test_set_proj = soft_plsda.transform(X_test)
for color_, class_ in zip(['cyan', 'magenta', 'black'], names.values()):
mask = y_test == class_
ax.plot(test_set_proj[mask,0], test_set_proj[mask,1], '*', color=color_, label='{} (Test)'.format(class_))
ax.legend(loc='best')
[38]:
<matplotlib.legend.Legend at 0x7ac3538384c0>
Optimizing the Model#
Here we took alpha as a meaningful choice of type I error rate, but it could also be adjusted. Moreover, we arbitrarily selected the number of PCs to use in the PLSDA model. We can use scikit-learn’s pipelines to automatically optimize hyperparameters like this.
[39]:
pipeline = imblearn.pipeline.Pipeline(
steps=[
("plsda", PLSDA(
n_components=5,
alpha=0.05,
scale_x=True,
not_assigned='UNKNOWN',
style='soft',
score_metric='TEFF'
)
)
])
# Let's optimize the TEFF of the model and allow alpha to vary - this is a "compliant" approach.
param_grid = [{
'plsda__n_components':[3, 4],
'plsda__alpha': [0.07, 0.05, 0.03, 0.01],
}]
gs = GridSearchCV(
estimator=pipeline,
param_grid=param_grid,
n_jobs=-1,
cv=sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
error_score=0,
refit=True
)
_ = gs.fit(X_train, y_train)
[40]:
# The best parameters found can be accessed like this:
gs.best_params_
[40]:
{'plsda__alpha': 0.03, 'plsda__n_components': 3}
[41]:
gs.best_score_ # The best score (TEFF) it recieved was...
[41]:
0.9727739913024074
[42]:
# You can see detailed CV results here
gs.cv_results_
[42]:
{'mean_fit_time': array([0.01206279, 0.01201566, 0.01257658, 0.01141119, 0.01039139,
0.00962798, 0.00946887, 0.01093936]),
'std_fit_time': array([0.00088673, 0.00081104, 0.00274705, 0.00102592, 0.00011664,
0.00012437, 0.0001569 , 0.00038004]),
'mean_score_time': array([0.02674349, 0.02722987, 0.02487612, 0.02466146, 0.02466027,
0.02299309, 0.02398237, 0.02348137]),
'std_score_time': array([0.00175109, 0.00115925, 0.00034594, 0.00036911, 0.00122064,
0.00117518, 0.00139078, 0.00226715]),
'param_plsda__alpha': masked_array(data=[0.07, 0.07, 0.05, 0.05, 0.03, 0.03, 0.01, 0.01],
mask=[False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'param_plsda__n_components': masked_array(data=[3, 4, 3, 4, 3, 4, 3, 4],
mask=[False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'plsda__alpha': 0.07, 'plsda__n_components': 3},
{'plsda__alpha': 0.07, 'plsda__n_components': 4},
{'plsda__alpha': 0.05, 'plsda__n_components': 3},
{'plsda__alpha': 0.05, 'plsda__n_components': 4},
{'plsda__alpha': 0.03, 'plsda__n_components': 3},
{'plsda__alpha': 0.03, 'plsda__n_components': 4},
{'plsda__alpha': 0.01, 'plsda__n_components': 3},
{'plsda__alpha': 0.01, 'plsda__n_components': 4}],
'split0_test_score': array([0.9812301 , 0.9617692 , 0.9812301 , 0.9812301 , 0.9812301 ,
0.9812301 , 0.949671 , 0.98107084]),
'split1_test_score': array([0.9367497 , 0.94273538, 0.975 , 0.96856853, 0.98742088,
0.98742088, 0.96824584, 0.96824584]),
'split2_test_score': array([0.90260041, 0.92954962, 0.92954962, 0.92954962, 0.949671 ,
0.93072552, 0.95622957, 0.94356372]),
'mean_test_score': array([0.9401934 , 0.94468474, 0.96192657, 0.95978275, 0.97277399,
0.96645883, 0.9580488 , 0.96429347]),
'std_test_score': array([0.03219266, 0.01322561, 0.02303481, 0.02199409, 0.01653063,
0.02539335, 0.00769148, 0.01556517]),
'rank_test_score': array([8, 7, 4, 5, 1, 2, 6, 3], dtype=int32)}
[43]:
# The refit=True (default) refits the model with best parameters on the entire training set at the end so you can
# use the pipeline directly.
gs.predict(X_test)[:10]
[43]:
[['setosa'],
['virginica'],
['versicolor'],
['versicolor'],
['setosa'],
['versicolor'],
['setosa'],
['setosa'],
['virginica'],
['versicolor']]
[44]:
# Within the pipeline you can access individual steps, and their methods.
_ = gs.best_estimator_.named_steps['plsda'].visualize(styles=['hard', 'soft'])
[45]:
gs.best_estimator_.named_steps['plsda'].score(X_train, y_train) # This is TEFF
[45]:
0.9791578013783069
[46]:
train_pred = gs.best_estimator_.named_steps['plsda'].predict(X_train)
fom = gs.best_estimator_.named_steps['plsda'].figures_of_merit(train_pred, y_train)
[47]:
fom['CM']
[47]:
| setosa | versicolor | virginica | UNKNOWN | |
|---|---|---|---|---|
| setosa | 39 | 0 | 0 | 1 |
| versicolor | 0 | 40 | 4 | 0 |
| virginica | 0 | 2 | 39 | 1 |
[48]:
fom['I']
[48]:
setosa 40
versicolor 40
virginica 40
dtype: int64
[49]:
fom['CSNS']
[49]:
setosa 0.975
versicolor 1.000
virginica 0.975
dtype: float64
[50]:
fom['CSPS']
[50]:
setosa 1.000
versicolor 0.975
virginica 0.950
dtype: float64
[51]:
fom['CEFF']
[51]:
setosa 0.987421
versicolor 0.987421
virginica 0.962419
dtype: float64
[52]:
fom['TSPS'], fom['TSNS'], fom['TEFF']
[52]:
(0.975, 0.9833333333333333, 0.9791578013783069)
[53]:
np.any(gs.best_estimator_.named_steps['plsda'].check_outliers())
[53]:
False
[54]:
# Test set performance
gs.best_estimator_.named_steps['plsda'].score(X_test, y_test) # The score being used here is TEFF
[54]:
0.9746794344808963
[55]:
test_pred = gs.best_estimator_.named_steps['plsda'].predict(X_test)
fom = gs.best_estimator_.named_steps['plsda'].figures_of_merit(test_pred, y_test)
[56]:
fom['CM']
[56]:
| setosa | versicolor | virginica | UNKNOWN | |
|---|---|---|---|---|
| setosa | 10 | 0 | 0 | 0 |
| versicolor | 0 | 10 | 1 | 0 |
| virginica | 0 | 2 | 10 | 0 |