Classification with PLS-DA#

Author: Nathan A. Mahynski

Date: 2023/08/23

Description: Illustrate modeling with PLS-DA.

Open In Colab

[ ]:
if 'google.colab' in str(get_ipython()):
    !pip install git+https://github.com/mahynski/pychemauth@main
    import os
    os.kill(os.getpid(), 9) # Automatically restart the runtime to reload libraries
[1]:
try:
    import pychemauth
except:
    raise ImportError("pychemauth not installed")

import matplotlib.pyplot as plt
%matplotlib inline

import watermark
%load_ext watermark

%load_ext autoreload
%autoreload 2
[2]:
import imblearn
import sklearn

from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd
[3]:
%watermark -t -m -v --iversions
Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pychemauth: 0.0.0b4
sklearn   : 1.3.0
matplotlib: 3.7.2
pandas    : 1.5.3
watermark : 2.4.3
imblearn  : 0.11.0
numpy     : 1.24.3

Load the Data

[4]:
from sklearn.datasets import load_iris as load_data
X, y = load_data(return_X_y=True, as_frame=True)
[5]:
# Let's turn the indices into names
names = dict(zip(np.arange(3), ['setosa', 'versicolor', 'virginica']))
y = y.apply(lambda x: names[x])
[6]:
X.head(5)
[6]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
[7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    shuffle=True,
    random_state=42,
    test_size=0.2,
    stratify=y # It is usually important to balance the test and train set so they have the same fraction of classes
)

Fitting a PLS-DA Model#

[8]:
from pychemauth.classifier.plsda import PLSDA

Training a Hard Model

“Hard” PLS-DA models use hyperplanes to divide the latent (score) space into distinct regions, 1 and only 1 of which is associated with a class.

[9]:
hard_plsda = PLSDA(
    n_components=3,
    alpha=0.05,
    gamma=0.01,
    not_assigned='UNKNOWN', # This needs to match the same datatype as y (here, y is str); for integers, e.g., use -1
    style="hard",
    scale_x=True,
    score_metric='TEFF' # Let's use TEFF as the model's scoring metric.  This could be TSNS or TSPS instead.
)
[10]:
_ = hard_plsda.fit(X_train, y_train)
[11]:
# 3 class systems can be plotted in 2D, 2 class systems in 1D; more than 3 cannot be shown. `visualize` automatically
# selects the right version for your.
_ = hard_plsda.visualize(styles=['hard'])
../../_images/jupyter_api_plsda_16_0.png
[12]:
# We can see what samples are predicted to be using the predict() function.
pred = hard_plsda.predict(X_train)
[13]:
print(
    'Test set TEFF = {}\nTrain set TEFF = {}'.format(
        '%.3f'%hard_plsda.score(X_test, y_test),
        '%.3f'%hard_plsda.score(X_train, y_train)
    )
)
Test set TEFF = 0.733
Train set TEFF = 0.750

For Hard PLS-DA each point is predicted to belong to 1 and only 1 class, so TEFF = TSPS = TSNS

[14]:
# The score() function is just testing how many are correctly predicted.  You can do this directly and
# easily with the "hard" version of PLS-DA.
fom = hard_plsda.figures_of_merit(pred, y_train)

print(fom['TSNS'], fom['TSPS'], fom['TEFF'])
0.75 0.75 0.75
[15]:
# This is also equivalent to the accuracy of the model
np.sum(np.array(pred).ravel() == y_train) / y_train.shape[0]
[15]:
0.75
[16]:
# Each row is what the sample ACTUALLY is, each column is what the PREDICTION is.
# Here, versicolor is predicted to be setosa 19 times.

fom['CM']
[16]:
setosa versicolor virginica UNKNOWN
setosa 39 1 0 0
versicolor 19 18 3 0
virginica 1 6 33 0
[17]:
# Total for each category
fom['I']
[17]:
setosa        40
versicolor    40
virginica     40
dtype: int64
[18]:
# Class sensitivity
fom['CSNS']
[18]:
setosa        0.975
versicolor    0.450
virginica     0.825
dtype: float64
[19]:
# Class specificity
fom['CSPS']
[19]:
setosa        0.7500
versicolor    0.9125
virginica     0.9625
dtype: float64
[20]:
# Class efficiency
fom['CEFF']
[20]:
setosa        0.855132
versicolor    0.640800
virginica     0.891102
dtype: float64

Training a Soft Model

“Soft” PLS-DA models use ellipses to divide the latent (score) space into possibly overlapping regions, meaning that a point can be assigned to 0, 1, or > 1 classes.

[21]:
# Here the data are elemental levels so we will scale the X data
soft_plsda = PLSDA(
    n_components=3,
    alpha=0.05,
    gamma=0.01,
    not_assigned='UNKNOWN',
    style="soft",
    scale_x=True,
    score_metric='TEFF'
)
[22]:
_ = soft_plsda.fit(X_train, y_train)
[27]:
# You can visualize both the hard and soft boundaries if you train a soft model.
# With a hard model, you only get the hard boundaries by default.
_ = soft_plsda.visualize(styles=['hard', 'soft'])
../../_images/jupyter_api_plsda_30_0.png
[28]:
# We can see what samples are predicted to be using the predict() function.
train_pred = soft_plsda.predict(X_train)
[29]:
# As shown above, there are no outliers in the training data.

# Only the soft version of PLSDA can do this check.
soft_plsda.check_outliers().any()
[29]:
False
[30]:
print(
    'Test set TEFF = {}\nTrain set TEFF = {}'.format(
        '%.3f'%soft_plsda.score(X_test, y_test),
        '%.3f'%soft_plsda.score(X_train, y_train)
    )
)
Test set TEFF = 0.983
Train set TEFF = 0.975
[31]:
# Samples can now be predicted to belong to multiple classes.
train_pred[:10]
[31]:
[['setosa'],
 ['virginica'],
 ['versicolor'],
 ['setosa'],
 ['versicolor'],
 ['virginica'],
 ['versicolor'],
 ['versicolor', 'virginica'],
 ['virginica'],
 ['virginica']]
[32]:
# More complete figures of merit can be computed.
fom = soft_plsda.figures_of_merit(train_pred, y_train)

# For Soft PLS-DA TEFF, TSPS, and TSNS are generally different
print(fom['TSNS'], fom['TSPS'], fom['TEFF'])
0.9666666666666667 0.9833333333333333 0.9749643868139777
[33]:
fom['I']
[33]:
setosa        40
versicolor    40
virginica     40
dtype: int64
[34]:
# The soft model can predict that things are unknown and that they might belong to multiple classes.
# Observe that there of 40 versicolor samples, there are 40 assignments to versicolor and 4 to virginica (40 + 4 > 40),
# because some are assigned to both categories.

fom['CM']
[34]:
setosa versicolor virginica UNKNOWN
setosa 38 0 0 2
versicolor 0 39 3 0
virginica 0 1 39 1
[35]:
fom = soft_plsda.figures_of_merit(soft_plsda.predict(X_test), y_test)
fom['CM']
[35]:
setosa versicolor virginica UNKNOWN
setosa 10 0 0 0
versicolor 0 10 1 0
virginica 0 1 10 0
[38]:
# We can plot the test on the training set like this
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,6))
ax = soft_plsda.visualize(styles=['soft'], ax=ax, show_training=False)

test_set_proj = soft_plsda.transform(X_test)
for color_, class_ in zip(['cyan', 'magenta', 'black'], names.values()):
    mask = y_test == class_
    ax.plot(test_set_proj[mask,0], test_set_proj[mask,1], '*', color=color_, label='{} (Test)'.format(class_))
ax.legend(loc='best')
[38]:
<matplotlib.legend.Legend at 0x7ac3538384c0>
../../_images/jupyter_api_plsda_39_1.png

Optimizing the Model#

Here we took alpha as a meaningful choice of type I error rate, but it could also be adjusted. Moreover, we arbitrarily selected the number of PCs to use in the PLSDA model. We can use scikit-learn’s pipelines to automatically optimize hyperparameters like this.

[39]:
pipeline = imblearn.pipeline.Pipeline(
    steps=[
        ("plsda", PLSDA(
            n_components=5,
            alpha=0.05,
            scale_x=True,
            not_assigned='UNKNOWN',
            style='soft',
            score_metric='TEFF'
        )
    )
])

# Let's optimize the TEFF of the model and allow alpha to vary - this is a "compliant" approach.
param_grid = [{
    'plsda__n_components':[3, 4],
    'plsda__alpha': [0.07, 0.05, 0.03, 0.01],
}]

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
    error_score=0,
    refit=True
)

_ = gs.fit(X_train, y_train)
[40]:
# The best parameters found can be accessed like this:
gs.best_params_
[40]:
{'plsda__alpha': 0.03, 'plsda__n_components': 3}
[41]:
gs.best_score_ # The best score (TEFF) it recieved was...
[41]:
0.9727739913024074
[42]:
# You can see detailed CV results here
gs.cv_results_
[42]:
{'mean_fit_time': array([0.01206279, 0.01201566, 0.01257658, 0.01141119, 0.01039139,
        0.00962798, 0.00946887, 0.01093936]),
 'std_fit_time': array([0.00088673, 0.00081104, 0.00274705, 0.00102592, 0.00011664,
        0.00012437, 0.0001569 , 0.00038004]),
 'mean_score_time': array([0.02674349, 0.02722987, 0.02487612, 0.02466146, 0.02466027,
        0.02299309, 0.02398237, 0.02348137]),
 'std_score_time': array([0.00175109, 0.00115925, 0.00034594, 0.00036911, 0.00122064,
        0.00117518, 0.00139078, 0.00226715]),
 'param_plsda__alpha': masked_array(data=[0.07, 0.07, 0.05, 0.05, 0.03, 0.03, 0.01, 0.01],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_plsda__n_components': masked_array(data=[3, 4, 3, 4, 3, 4, 3, 4],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'plsda__alpha': 0.07, 'plsda__n_components': 3},
  {'plsda__alpha': 0.07, 'plsda__n_components': 4},
  {'plsda__alpha': 0.05, 'plsda__n_components': 3},
  {'plsda__alpha': 0.05, 'plsda__n_components': 4},
  {'plsda__alpha': 0.03, 'plsda__n_components': 3},
  {'plsda__alpha': 0.03, 'plsda__n_components': 4},
  {'plsda__alpha': 0.01, 'plsda__n_components': 3},
  {'plsda__alpha': 0.01, 'plsda__n_components': 4}],
 'split0_test_score': array([0.9812301 , 0.9617692 , 0.9812301 , 0.9812301 , 0.9812301 ,
        0.9812301 , 0.949671  , 0.98107084]),
 'split1_test_score': array([0.9367497 , 0.94273538, 0.975     , 0.96856853, 0.98742088,
        0.98742088, 0.96824584, 0.96824584]),
 'split2_test_score': array([0.90260041, 0.92954962, 0.92954962, 0.92954962, 0.949671  ,
        0.93072552, 0.95622957, 0.94356372]),
 'mean_test_score': array([0.9401934 , 0.94468474, 0.96192657, 0.95978275, 0.97277399,
        0.96645883, 0.9580488 , 0.96429347]),
 'std_test_score': array([0.03219266, 0.01322561, 0.02303481, 0.02199409, 0.01653063,
        0.02539335, 0.00769148, 0.01556517]),
 'rank_test_score': array([8, 7, 4, 5, 1, 2, 6, 3], dtype=int32)}
[43]:
# The refit=True (default) refits the model with best parameters on the entire training set at the end so you can
# use the pipeline directly.
gs.predict(X_test)[:10]
[43]:
[['setosa'],
 ['virginica'],
 ['versicolor'],
 ['versicolor'],
 ['setosa'],
 ['versicolor'],
 ['setosa'],
 ['setosa'],
 ['virginica'],
 ['versicolor']]
[44]:
# Within the pipeline you can access individual steps, and their methods.
_ = gs.best_estimator_.named_steps['plsda'].visualize(styles=['hard', 'soft'])
../../_images/jupyter_api_plsda_47_0.png
[45]:
gs.best_estimator_.named_steps['plsda'].score(X_train, y_train) # This is TEFF
[45]:
0.9791578013783069
[46]:
train_pred = gs.best_estimator_.named_steps['plsda'].predict(X_train)
fom = gs.best_estimator_.named_steps['plsda'].figures_of_merit(train_pred, y_train)
[47]:
fom['CM']
[47]:
setosa versicolor virginica UNKNOWN
setosa 39 0 0 1
versicolor 0 40 4 0
virginica 0 2 39 1
[48]:
fom['I']
[48]:
setosa        40
versicolor    40
virginica     40
dtype: int64
[49]:
fom['CSNS']
[49]:
setosa        0.975
versicolor    1.000
virginica     0.975
dtype: float64
[50]:
fom['CSPS']
[50]:
setosa        1.000
versicolor    0.975
virginica     0.950
dtype: float64
[51]:
fom['CEFF']
[51]:
setosa        0.987421
versicolor    0.987421
virginica     0.962419
dtype: float64
[52]:
fom['TSPS'], fom['TSNS'], fom['TEFF']
[52]:
(0.975, 0.9833333333333333, 0.9791578013783069)
[53]:
np.any(gs.best_estimator_.named_steps['plsda'].check_outliers())
[53]:
False
[54]:
# Test set performance
gs.best_estimator_.named_steps['plsda'].score(X_test, y_test) # The score being used here is TEFF
[54]:
0.9746794344808963
[55]:
test_pred = gs.best_estimator_.named_steps['plsda'].predict(X_test)
fom = gs.best_estimator_.named_steps['plsda'].figures_of_merit(test_pred, y_test)
[56]:
fom['CM']
[56]:
setosa versicolor virginica UNKNOWN
setosa 10 0 0 0
versicolor 0 10 1 0
virginica 0 2 10 0