This is part 1 of a 5-part tour of machine learning in Python. In this part I provide a high-level overview of optimization methods for machine learning, and in particular, for training models. Most importantly, we see the motivation for gradient descent and provide some justification for this hugely popular approach, which is the foundation of many many other model training methods. As much as possible, I rely on a “from scratch” approach, avoiding high-level libraries like scikit-learn.
### load dependencies
# essentials
import numpy.linalg as LA # linalg module of numpy
import pandas as pd # pandas for data manipulation
import autograd.numpy as np # autograd-wrapped numpy
# optimization
from autograd import grad # module for computing gradient
from autograd import value_and_grad # returns grad & val of input function
# plotting
import matplotlib.pyplot as plt # pyplot module of matplotlib
# global plotting parameters
= (np.array([94,255,231])/360, np.array([133,94,214])/360,
plt_colours 110,250,152])/360)
np.array([= 14
font_size = {'fontname': 'Avenir'} font_name
In this problem we examine the Breast Cancer Data Set, obtained from the University of Wisconsin Hospitals, Madison from Dr. William H. Wolberg 1.
In the following we implement the softmax and perceptron cost functions.
notes
Below we import the dataset.
### setup dataset
import numpy as np
# data input
= 'data/breast_cancer_data.csv'
csvname = np.loadtxt(csvname, delimiter = ',')
data
# get input and output of dataset
= data[:-1, :] # feature vectors
x = data[-1:, :] # class labels {1,-1} y
We now define our model and complete an implementation of the softmax and perceptron cost functions.
### define model and cost functions
# define model (compute linear combination of input points)
def model(x, w):
= w[0] + np.dot(x.T, w[1:])
a return a.T
## cost functions (`g`)
# softmax cost implementation
# note: x,y req. by cost but def. globally s.t. for given w compute cost
# across all x,y)
def softmax(w):
# note: equivalent to softnax aoprox to perceptron cost, below
= np.sum(np.log(1 + np.exp(-y*model(x, w))))
cost return cost/float(np.size(y))
# perceptron cost implementation
def perceptron(w):
# sum of pair-wise maximum between 0, model values
= np.sum(np.maximum(0, -y*model(x, w)))
cost return cost/float(np.size(y))
We will now optimize these cost functions using gradient descent.
Below we define the standard gradient descent algorithm.
### GD function
def gradient_descent(g, alpha, max_its, w):
import autograd.numpy as np
from autograd import grad
# define alpha based on chosen cost function
# alpha = 1.0 if g == softmax else .1
if g == softmax: alpha = 1.0
elif g == perceptron: alpha = .1
elif g == cross_entropy: alpha = .6
# compute gradient (wrt w, holding constant x,y)
= grad(g)
gradient
# run GD loop
= [w] # weight history container
weight_history = [g(w)] # cost function history container
cost_history for k in range(max_its):
# evaluate the gradient
= gradient(w)
grad_eval
# take gradient descent step
= w - alpha*grad_eval
w
# record weight and cost
weight_history.append(w)
cost_history.append(g(w))
return weight_history, cost_history
Below we define our parameters and then minimize both cost functions.
### optimize cost functions
import autograd.numpy as np
# define gradient descent (GD) parameters
= 1000
max_its = 0.1*np.random.randn(9, 1) # initial starting weights
w
# call GD (choose arb alpha for now bc based on chosen g)
= gradient_descent(softmax, 1,
weight_history_SM, cost_history_SM
max_its, w)= gradient_descent(perceptron, 1,
weight_history_PT, cost_history_PT max_its, w)
We now compute the number of misclassifications for each iteration of gradient descent.
### compute misclassification history
import autograd.numpy as np
# compute misclassification history
def miscount(w, x, y):
# +1 or -1 whether above or below fitted decision bound determined by w
= np.sign(model(x, w))
y_pred
# pairwise compare vectors: 1 if model class pred == true class; sum
= np.sum(y_pred != y)
miscount_curr return miscount_curr
# compute miscount history for each cost function
= [miscount(v, x, y) for v in weight_history_SM]
miscount_history_SM = [miscount(v, x, y) for v in weight_history_PT] miscount_history_PT
Below we plot the cost and misclassification history for each cost function.
### define function for plotting cost and misclassification hist in same figure
def plot_cost_and_miscount(data):
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
# store data
= data["x_plt"]
x_plt = data["y_miscount_SM"]
y_miscount_SM = data["y_cost_SM"]
y_cost_SM = data["y_miscount_PT"]
y_miscount_PT = data["y_cost_PT"]
y_cost_PT
# use seaborn style
sns.set_theme()
# to prevent warning messages from appearing in the report
import warnings;
'ignore')
warnings.filterwarnings(
# set inline figure format/quality, overall params
# %config InlineBackend.figure_format = 'svg'
# setup plot & subplots
= plt.subplots(2, 1, figsize = np.array([6.5, 8]),
fig, axs = True) # overall layout, size
constrained_layout # axs[0] = plt.subplot(1,3,(1,2), frameon = 1)
0] = plt.subplot(211, frameon = 1)
axs[1] = plt.subplot(212, frameon = 1)
axs[
# set axis tick sizes
0].xaxis.set_tick_params(labelsize = font_size - 4)
axs[0].yaxis.set_tick_params(labelsize = font_size - 4)
axs[1].xaxis.set_tick_params(labelsize = font_size - 4)
axs[1].yaxis.set_tick_params(labelsize = font_size - 4)
axs[
# plot overlay of misclassifications & costs for each cost over it
= axs[0].plot(x_plt, y_miscount_SM,
line_miscount_SM = plt_colours[0],
color = '-', linewidth = 2)
linestyle = axs[0].plot(x_plt, y_miscount_PT,
line_miscount_PT = plt_colours[1],
color = '-', linewidth = 2)
linestyle = axs[1].plot(x_plt, y_cost_SM,
line_cost_SM = plt_colours[0],
color = '-', linewidth = 2)
linestyle = axs[1].plot(x_plt, y_cost_PT,
line_cost_PT = plt_colours[1],
color = '-', linewidth = 2)
linestyle
# add labels
# axs[0].set_xlabel('iteration', fontsize = font_size, **font_name)
0].set_xlabel('iteration', fontsize = font_size)
axs[0].set_ylabel('misclassifications', fontsize = font_size)
axs[1].set_xlabel('iteration', fontsize = font_size)
axs[1].set_ylabel('cost', fontsize = font_size)
axs[
# plot legend
= ['softmax', 'perceptron']
leg_labels 0].legend([line_miscount_SM, line_miscount_PT], labels = leg_labels,
axs[= 'upper right', frameon = 1, fontsize = font_size - 4)
loc 1].legend([line_cost_SM, line_cost_PT], labels = leg_labels,
axs[= 'upper right', frameon = 1, fontsize = font_size - 4)
loc
plt.show()
### store data and plot
import pandas as pd
# store plotting data in data frame
= pd.DataFrame(
plot_data
{"x_plt": np.arange(np.size(miscount_history_SM)),
"y_miscount_SM": miscount_history_SM,
"y_cost_SM": cost_history_SM,
"y_miscount_PT": miscount_history_PT,
"y_cost_PT": cost_history_PT
})
# plot
plot_cost_and_miscount(plot_data)
Below we identify and return the minimum number of misclassifications for each cost function over all iterations of gradient descent.
### identify minima
= min(miscount_history_SM)
min_misclass_SM = min(miscount_history_PT)
min_misclass_PT
print("softmax minimum misclassifications: " + str(min_misclass_SM))
print("perceptron minimum misclassifications: " + str(min_misclass_PT))
Below we modify the miscount
function to consider only misclassified
malignant cases (i.e., cases corresponding to the class label \(-1\)).
#### define misclassification rate
import autograd.numpy as np
# compute misclassification history
def miscount_neg(w, x, y):
# +1 or -1 whether above or below fitted decision bound determined by w
= np.sign(model(x, w))
y_pred
# consider only malignant cases
= np.sum(y_pred[y == -1] != -1)
miscount_curr return miscount_curr
# compute new miscount history for each cost function
= [miscount_neg(v, x, y) for v in weight_history_SM]
miscount_history_SM_neg = [miscount_neg(v, x, y) for v in weight_history_PT] miscount_history_PT_neg
Below we again plot misclassification and cost history side-by-side, this time considering only misclassified malignant cases.
### store data and plot
# import pandas for data frame capabilities
import pandas as pd
# store plotting data in data frame
= pd.DataFrame(
plot_data
{"x_plt": np.arange(np.size(miscount_history_SM_neg)),
"y_miscount_SM": miscount_history_SM_neg,
"y_cost_SM": cost_history_SM,
"y_miscount_PT": miscount_history_PT_neg,
"y_cost_PT": cost_history_PT
})
# plot
plot_cost_and_miscount(plot_data)
In the following we compare the above results to those obtained using logistic regression with a cross entropy cost.
The code below (ref) converts our class labels y
= \(\{-1, 1\}\) to
\(\{0, 1\}\).
### setup dataset
= np.squeeze(y) # flatten y (one dimension)
y_1
= np.argwhere(y_1 > 0.9) # returns all indices where y meets condition
y_ben = np.argwhere(y_1 < -0.9)
y_mal
= np.arange(699) # class labels (`y`) for CE (cross-entropy) cost
yc = 1
yc[y_ben] = 0 yc[y_mal]
The code below (ref) implements the cross entropy cost function, with L2 regularization.
### define model and cross entropy cost
# define sigmoid function
def sigmoid(t):
return 1/(1 + np.exp(-t))
## the (convex) cross-entropy cost function
= 2*10**(-3) # regularization parameter
lam def cross_entropy(w):
# compute sigmoid of model
= sigmoid(model(x, w))
sig_eval
# compute cost of label 0 points
= np.argwhere(yc == 0)
ind_mal = -np.sum(np.log(1 - sig_eval[:, ind_mal]))
cost
# add (subtract) cost on label 1 points
= np.argwhere(yc == 1)
ind_ben -= np.sum(np.log(sig_eval[:, ind_ben]))
cost
# add regularizer (* regularization parameter)
+= lam*np.sum(w[1:]**2)
cost
# compute cross-entropy
return cost/float(np.size(yc))
Below we run gradient descent to optimize the cross entropy cost function for logistic regression.
### optimize cost function using GD
import autograd.numpy as np
# define gradient descent parameters
= 1000
max_its = 0.1*np.random.randn(9, 1) # initial starting weights
w
# call GD (choose arb alpha for now bc based on chosen g)
= gradient_descent(cross_entropy, 1,
weight_history_CE, cost_history_CE max_its, w)
We now compute misclassification history. To do so we must modify
miscount
to compute misclassifications in a manner appropriate to
logistic regression. Since the sigmoid function varies between 0 and 1
and is symmetric (about a \(180^{\circ}\) rotation centered at
\((0, 0.5)\)), we can use \(y = 0.5\) to effectively separate our
predictions into the two classes labelled \(\{0, 1\}\). Then, as before,
we can determine misclassiffications by comparing the predicted classes
across weight_history
with the true class labels.
### compute misclassification history
import autograd.numpy as np
# compute misclassification history
def miscount_CE(w, x, y):
= np.sign(model(x, w)) # 1 or 1 if eval > .5
y_pred
# convert predictions to classes based on y = .5 threshold value
>= .5] = 1
y_pred[y_pred < .5] = 0
y_pred[y_pred
# pairwise compare vectors: 1 if model class pred == true class; sum
= np.sum(y_pred != y)
miscount_curr
return miscount_curr
# compute new miscount history for cross entropy cost
= [miscount_CE(v, x, yc) for v in weight_history_CE] miscount_history_CE
And finally, below we plot cost and misclassification history for the logistic regression cross entropy cost, as we did above for the softmax and perceptron cost functions (alongside the results for the perceptron cost function). First we make a couple minor modifications to generalize our plotting function and allow us to plot cross entropy cost in addition to softmax cost.
### define plotting function
def plot_cost_and_miscount(data):
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
# store data
= data["x_plt"]
x_plt # y_miscount_1 = data["y_miscount_CE"]
# y_cost_1 = data["y_cost_CE"]
# y_miscount_2 = data["y_miscount_PT"]
# y_cost_2 = data["y_cost_PT"]
= data.iloc[:, 1]
y_miscount_1 = data.iloc[:, 2]
y_cost_1 = data.iloc[:, 3]
y_miscount_2 = data.iloc[:, 4]
y_cost_2
# use seaborn style
sns.set_theme()
# to prevent warning messages from appearing in the report
import warnings;
'ignore')
warnings.filterwarnings(
# set inline figure format/quality, overall params
# %config InlineBackend.figure_format = 'svg'
# setup plot & subplots
= plt.subplots(2, 1, figsize = np.array([6.5, 8]),
fig, axs = True) # overall layout, size
constrained_layout # axs[0] = plt.subplot(1,3,(1,2), frameon = 1)
0] = plt.subplot(211, frameon = 1)
axs[1] = plt.subplot(212, frameon = 1)
axs[
# set axis tick sizes
0].xaxis.set_tick_params(labelsize = font_size - 4)
axs[0].yaxis.set_tick_params(labelsize = font_size - 4)
axs[1].xaxis.set_tick_params(labelsize = font_size - 4)
axs[1].yaxis.set_tick_params(labelsize = font_size - 4)
axs[
# plot overlay of misclassifications & costs for each cost over all it
= axs[0].plot(x_plt, y_miscount_1,
line_miscount_CE = plt_colours[0],
color = '-', linewidth = 2)
linestyle = axs[0].plot(x_plt, y_miscount_2,
line_miscount_PT = plt_colours[1],
color = '-', linewidth = 2)
linestyle = axs[1].plot(x_plt, y_cost_1,
line_cost_CE = plt_colours[0],
color = '-', linewidth = 2)
linestyle = axs[1].plot(x_plt, y_cost_2,
line_cost_PT = plt_colours[1],
color = '-', linewidth = 2)
linestyle
# add labels
# axs[0].set_xlabel('iteration', fontsize = font_size, **font_name)
0].set_xlabel('iteration', fontsize = font_size)
axs[0].set_ylabel('misclassifications', fontsize = font_size)
axs[1].set_xlabel('iteration', fontsize = font_size)
axs[1].set_ylabel('cost', fontsize = font_size)
axs[
# plot legend
= ['cross entropy', 'perceptron']
leg_labels 0].legend([line_miscount_CE, line_miscount_PT], labels = leg_labels,
axs[= 'upper right', frameon = 1, fontsize = font_size - 4)
loc 1].legend([line_cost_CE, line_cost_PT], labels = leg_labels,
axs[= 'upper right', frameon = 1, fontsize = font_size - 4)
loc
plt.show()
### store data and plot
import pandas as pd
# store plotting data in data frame
= pd.DataFrame(
plot_data
{"x_plt": np.arange(np.size(miscount_history_CE)),
"y_miscount_CE": miscount_history_CE,
"y_cost_CE": cost_history_CE,
"y_miscount_PT": miscount_history_PT,
"y_cost_PT": cost_history_PT
})
# plot
plot_cost_and_miscount(plot_data)
In this problem we examinine the Spambase Data Set 2, containing measurements of spam and non-spam emails. Using the two-class classification methods we explored in the above sections, we build a classifier to determine if an email is likely to be spam.
notes
Class labels (1 for ‘spam’, -1 for ‘not spam’) are again stored in the final row (\(N = 58\)) of the dataset, with each other row \(n\) representing a feature (word/character frequencies, etc.) and each column \(p\) a sample (\(P = 4601\)).
Below we import the dataset.
### import dataset
import numpy as np
# data input
= 'data/spambase_data.csv'
csvname = np.loadtxt(csvname, delimiter = ',')
data
# get input and output of dataset
= data[:-1, :] # feature vectors
x = data[-1:, :] # class labels {1,-1} y
We now perform some preproc`essing to prepare the dataset for our analysis. We standard normalize each feature (s.t. \(\mu_n = 0\) and \(\sigma_n = 1\), where \(\mu\) represents the mean and \(\sigma\) the standard deviation) so that they can be treated equally by our classifier. To do so, for each observation \(x_{n,p}\) we subtract the corresponding mean \(\mu_n\) and divide by the corresponding standard deviation \(\sigma_n\).
To deal with missing observations for a given feature (represented as
NaN
), we replace NaN
with the corresponding feature mean \(\mu_n\).
The following function (ref) executes the above.
### perform standard normalization and fill in missing data
def standard_normalizer(x):
# compute the mean and standard deviation of each feature
= np.nanmean(x, axis = 1)[:, np.newaxis]
x_means = np.nanstd(x, axis = 1)[:, np.newaxis]
x_stds
# check to make sure that x_stds > small threshold; for those not
# divide by 1 instead of original standard deviation
= np.argwhere(x_stds < 10**(-2))
ind if len(ind) > 0:
= [v[0] for v in ind] # just keep row index
ind = np.zeros((x_stds.shape)) # array of indices to replace with 1
adjust = 1.0
adjust[ind] += adjust
x_stds
# fill in any nan values with corresponding feature mean
= np.argwhere(np.isnan(x) == True)
ind for i in ind:
0], i[1]] = x_means[i[0]]
x[i[
# create standard normalizer function
= lambda data: (data - x_means)/x_stds
normalizer
# create inverse standard normalizer
= lambda data: data*x_stds + x_means
inverse_normalizer
# return (inverse) normalizer
return normalizer, inverse_normalizer
## standard normalize dataset
= standard_normalizer(x) # get std norm functions
normalizer, inverse_normalizer = normalizer(x) x
In the following we classify emails using the softmax and perceptron cost functions.
Steps:
We carry out the same sequence of steps covered in detail in part 1:
Identify optimal decision boundary (determined by weights \(w\)):
max its = 1000
;
w = 0.1*np.random.randn(N + 1, 1)
, where \(N = 57\)Compute misclassification histories
Plot cost and misclassification as a function of gradient descent iteration
### 1) optimize softmax and perceptron costs over `spambase` dataset
import autograd.numpy as np
# define gradient descent parameters
= 1000
max_its = 0.1*np.random.randn(np.shape(x)[0] + 1, 1) # initial starting weights
w
# call GD (choose arb alpha for now bc based on chosen g)
= gradient_descent(softmax, 1,
weight_history_SM_spam, cost_history_SM_spam
max_its, w)= gradient_descent(perceptron, 1,
weight_history_PT_spam, cost_history_PT_spam max_its, w)
### 2) compute misclassification history for each cost function
= [miscount(v, x, y) for v in weight_history_SM_spam]
miscount_history_SM_spam = [miscount(v, x, y) for v in weight_history_PT_spam] miscount_history_PT_spam
### 3) for each cost, plot cost and misclassification as a function of iteration
## store data and plot
import pandas as pd
# store plotting data in data frame
= pd.DataFrame(
plot_data
{"x_plt": np.arange(np.size(miscount_history_SM_spam)),
"y_miscount_SM": miscount_history_SM_spam,
"y_cost_SM": cost_history_SM_spam,
"y_miscount_PT": miscount_history_PT_spam,
"y_cost_PT": cost_history_PT_spam
})
## plot
plot_cost_and_miscount(plot_data)
The plots above reveal some interesting results. The perceptron cost appears to attain a stable minimum cost, but its misclassification history appears to fluctuate rapidly between ~250 and 750 misclassifications. On the other hand, the softmax cost appears to attain a stable, albeit higher, minimum cost, but its misclassification history reaches a similar minimum to the softmax cost and is stable (does not fluctate rapidly). This suggests that the perceptron cost function is not perfectly flat at the identified minimum, causing minor perturbations in weight to produce large deviations from the minimum. It is possible that different (non-standard) gradient descent schemes (e.g., fully normalized) would eliminate this behaviour.
In the following we determine the misclassification minimum and convert it to accuracy.
Recall: we define accuracy, \(\mathcal{A}\) as
\[ \mathcal{A} = 1 - \frac{1}{P} \sum\limits_{p = 1}^P I(\hat{y}_p, y_p) , \]
where \(\sum\limits_{p = 1}^P I\) represents total misclassifications, which we average over all samples \(P\) and subtract from 1.
### determine minimum misclassifications for each cost and convert to accuracy
# identify minima
= min(miscount_history_SM_spam)
min_misclass_SM_spam = min(miscount_history_PT_spam)
min_misclass_PT_spam
print("softmax minimum misclassifications: " + str(min_misclass_SM_spam))
print("perceptron minimum misclassifications: " + str(min_misclass_PT_spam))
# convert to accuracies
= 1 - (min_misclass_SM_spam)/np.shape(x)[1]
accuracy_SM_spam = 1 - (min_misclass_PT_spam)/np.shape(x)[1]
accuracy_PT_spam
print("\nsoftmax accuracy: " + str(accuracy_SM_spam))
print("perceptron accuracy: " + str(accuracy_PT_spam))
As we can see above, both classifiers, trained using a softmax and perceptron cost, respectively, appear to perform relatively well, with accuracies above 90%. However, softmax cost performs slighly better, with 3 fewer misclassifications than the perceptron. This result reflects our expectations based on the misclassification and cost histories plotted above.
In the following we identify the optimal w
found by the softmax cost
and construct the corresponding confusion matrix 3.
### construct confusion matrix for optimal softmax `w`
## define model again (compute linear combination of input points)
def model(x, w):
= w[0] + np.dot(x.T, w[1:])
a return a.T
## identify optimal `w` for cost
= weight_history_SM_spam[np.argmin(miscount_history_SM_spam)]
w_optimal_SM_spam # w_optimal_PT_spam = weight_history_PT_spam[np.argmin(miscount_history_PT_spam)]
### function for plotting confusion matrix
def plot_confusion_matrix(w, x, y):
import seaborn as sns
import pandas as pd
import numpy as np
# +1 or -1 whether above or below fitted decision bound determined by w
= np.sign(model(x, w))
y_pred
## determine confusion matrix values (true pos/neg, false pos/neg)
= np.sum(y_pred[y == 1] == 1)
TP = np.sum(y_pred[y == -1] == -1)
TN = np.sum(y_pred[y == -1] != -1)
FP = np.sum(y_pred[y == 1] != 1)
FN
# store values in dictionary, matrix for plotting
= {"TP": TP, "TN": TN, "FP": FP, "FN": FN}
confusion_matrix_OP # confusion_matrix_OP["TN"] = TN
= np.array([[TP, FN],
conf_matrix
[FP, TN]])
## plot confusion matrix
# set annotations for tiles based on counts
= ['TP', 'FN', 'FP', 'TN']
group_names = ["{0:0.0f}".format(value) for value in conf_matrix.flatten()]
group_counts = ["{0:.2%}".format(value) for value in
group_percentages /np.sum(conf_matrix)]
conf_matrix.flatten()
= [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
tile_labels zip(group_names, group_counts, group_percentages)]
= np.asarray(tile_labels).reshape(2, 2)
tile_labels
# plot
= sns.heatmap(conf_matrix, annot = tile_labels, fmt = '', cmap = 'Blues')
ax
# labels
'\nPredicted Values', fontsize = font_size)
ax.set_xlabel('Actual Values\n', fontsize = font_size);
ax.set_ylabel('Spam', 'Not spam'], fontsize = font_size)
ax.xaxis.set_ticklabels(['Spam', 'Not spam'], fontsize = font_size)
ax.yaxis.set_ticklabels([
plt.show()
return confusion_matrix_OP
## plot confusion matrix for softmax and optimal weight
= plot_confusion_matrix(w_optimal_SM_spam, x, y) confusion_matrix
In this problem we examinine a dataset containing credit ratings, described in Example 6.11 of Machine Learning Refined.
notes
Below we import the dataset.
### import dataset
import numpy as np
# data input
= 'data/credit_dataset.csv'
csvname = np.loadtxt(csvname, delimiter = ',')
data
# get input and output of dataset
= data[:-1, :] # feature vectors
x = data[-1:, :] # class labels {1,-1} y
Below we standard normalize the dataset so that features can be compared.
### perform standard normalization
= standard_normalizer(x) # get std norm functions
normalizer, inverse_normalizer = normalizer(x) x
Below we fit a binary classifier using a perceptron cost.
### optimize perceptron cost over `credit rating` dataset
import autograd.numpy as np
# define gradient descent parameters
= 1000
max_its = 0.1*np.random.randn(np.shape(x)[0] + 1, 1) # initial starting weights
w
# call GD (choose arb alpha for now bc based on chosen g)
= gradient_descent(perceptron, 1,
weight_history_PT_cred, cost_history_PT_cred max_its, w)
### compute misclassification history
= [miscount(v, x, y) for v in weight_history_PT_cred] miscount_history_PT_cred
### store data for plotting
import pandas as pd
# store plotting data
= np.arange(np.size(miscount_history_PT_cred))
x_plt = miscount_history_PT_cred
y_miscount_PT = cost_history_PT_cred y_cost_PT
### plot cost and misclassification history for perceptron cost
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
# use seaborn style
sns.set_theme()
# to prevent warning messages from appearing in the report
import warnings;
'ignore')
warnings.filterwarnings(
# set inline figure format/quality, overall params
# %config InlineBackend.figure_format = 'svg'
# setup plot & subplots
= plt.subplots(2, 1, figsize = np.array([6.5, 8]),
fig, axs = True) # overall layout, size
constrained_layout # axs[0] = plt.subplot(1,3,(1,2), frameon = 1)
0] = plt.subplot(211, frameon = 1)
axs[1] = plt.subplot(212, frameon = 1)
axs[
# set axis tick sizes
0].xaxis.set_tick_params(labelsize = font_size - 4)
axs[0].yaxis.set_tick_params(labelsize = font_size - 4)
axs[1].xaxis.set_tick_params(labelsize = font_size - 4)
axs[1].yaxis.set_tick_params(labelsize = font_size - 4)
axs[
# plot overlay of misclassifications & costs for each cost over all it
= axs[0].plot(x_plt, y_miscount_PT,
line_miscount_CE = plt_colours[0],
color = '-', linewidth = 2)
linestyle = axs[1].plot(x_plt, y_cost_PT,
line_cost_CE = plt_colours[0],
color = '-', linewidth = 2)
linestyle
# add labels
# axs[0].set_xlabel('iteration', fontsize = font_size, **font_name)
0].set_xlabel('iteration', fontsize = font_size)
axs[0].set_ylabel('misclassifications', fontsize = font_size)
axs[1].set_xlabel('iteration', fontsize = font_size)
axs[1].set_ylabel('cost', fontsize = font_size)
axs[
plt.show()
### compute and output perceptron accuracy
= min(miscount_history_PT_cred)
min_misclass_PT_cred = 1 - (min_misclass_PT_cred)/np.shape(x)[1]
accuracy_PT_cred print("perceptron accuracy: " + str(accuracy_PT_cred))
The perceptron achieves a classification accuracy of ~76%, which is within the range of the accuracy achieved in Machine Learning Refined.
We now plot the confusion matrix corresponding to the optimal weight identified by the perceptron.
### plot confusion matrix for perceptron and optimal weight
= weight_history_PT_cred[np.argmin(miscount_history_PT_cred)]
w_optimal_PT_cred = plot_confusion_matrix(w_optimal_PT_cred, x, y) confusion_matrix
In this problem we examine a three-class toy dataset, shown in Fig. 7.9 of Machine Learning Refined, using the multi-class perceptron. Additional resources (including plotting functions) are provided in the official ‘MLR’ git repository 4.
Below we fit the multi-class perceptron to the dataset.
### import dataset & standard normalize
import numpy as np
# data input
= 'data/3class_data.csv'
csvname = np.loadtxt(csvname, delimiter = ',')
data
# get input and output of dataset
= data[:-1, :] # feature vectors
x = data[-1:, :] # class labels {1,-1}
y
## normalize
= standard_normalizer(x) # get std norm functions
normalizer, inverse_normalizer = normalizer(x) x
### define model and multi-class perceptron (ref)
## compute C linear combinations of input point, one per classifier
def model(x, w):
= w[0] + np.dot(x.T, w[1:])
a return a.T
## multi-class perceptron
= 10**-5 # regularization paramter
lam def multiclass_perceptron(w):
# pre-compute predictions on all points
= model(x, w)
all_evals
# compute maximum across data points
= np.max(all_evals, axis = 0)
a
# compute cost in compact form using numpy broadcasting
= all_evals[y.astype(int).flatten(), np.arange(np.size(y))]
b = np.sum(a - b)
cost
# add regularizer
= cost + lam*np.linalg.norm(w[1:, :], 'fro')**2
cost
# return average
return cost/float(np.size(y))
### fit multi-class perceptron
import autograd.numpy as np
# define gradient descent parameters
= 1000
max_its = 0.1*np.random.randn(3, 3) # initial starting weights
w
# call GD (choose arb alpha for now bc based on chosen g)
= gradient_descent(
weight_history_3CPT, cost_history_3CPT .1, max_its, w) multiclass_perceptron,
### plot cost history
## store data
= np.arange(np.shape(weight_history_3CPT)[0])
x_plt = cost_history_3CPT
y_cost_3CPT
## plot
def plot_cost_history(x, y):
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
# use seaborn style
sns.set_theme()
# setup plot
= plt.subplots(1, 1, figsize = np.array([5, 5]),
fig, ax = True) # overall layout, size
constrained_layout = plt.subplot(111, frameon = 1)
ax
# set axis tick sizes
= font_size - 4)
ax.xaxis.set_tick_params(labelsize = font_size - 4)
ax.yaxis.set_tick_params(labelsize
# plot cost history
= ax.plot(x, y, color = plt_colours[0],
y_cost_3CPT = '-', linewidth = 2)
linestyle
# add labels
'iteration', fontsize = font_size)
ax.set_xlabel('cost', fontsize = font_size)
ax.set_ylabel(
plt.show()
plot_cost_history(x_plt, y_cost_3CPT)
1]
weight_history_3CPT[
model(x, w)
np.shape(model(x, w))
= model(x, weight_history_3CPT[1])
all_evals
np.shape(all_evals)max(all_evals, axis = 0)) np.shape(np.
### define new function to compute multi-class misclassification history
def miscount_3CPT(w, x, y):
# 0-2 based on weight that maximizes prediction
# y_pred = np.argmax(np.abs(model(x, w), axis = 0), axis = 0)
# compute prediction for all points
= model(x, w)
all_evals
# assign point class of DB that maximumizes distance to DB
= np.argmax(all_evals, axis = 0)
y_pred
# pairwise compare vectors: 1 if model class pred == true class; sum
= np.sum(y_pred != y)
miscount_curr return miscount_curr
### compute accuracy for optimal weight (minimum misclassifications)
# compute misclassifications
= [miscount_3CPT(v, x, y) for v in weight_history_3CPT]
miscount_history_3CPT
# find minimum misclassifications and convert to accuracy
= min(miscount_history_3CPT)
min_misclass_3CPT print("multi-class perceptron minimum misclassifications: "
+ str(min_misclass_3CPT))
# convert to accuracies
= 1 - (min_misclass_3CPT)/np.shape(x)[1]
accuracy_3CPT
print("multi-class perceptron accuracy: " + str(accuracy_3CPT))
Below we plot the data in the plane, along with the decision boundary obtained by the multi-class perceptron.
# code to clone library from git repo subdirectory
# (from Ciro Santilli @ https://stackoverflow.com/questions/600079/how-do-i-clone-a-subdirectory-only-of-a-git-repository)
# git clone \
# --depth 1 \
# --filter=blob:none \
# --sparse \
# https://github.com/jermwatt/machine_learning_refined \
# ;
# cd machine_learning_refined
# git sparse-checkout set mlrefined_libraries
### plot data and decision boundary
# import relevant MLR plotting libraries (store lib in current dir)
from mlrefined_libraries import superlearn_library as superlearn
from mlrefined_libraries import math_optimization_library as optlib
= optlib.optimizers
optimizers = superlearn.multi_lin_classification_demo
classif_plotter = superlearn.cost_functions
cost_lib = superlearn.normalizers
normalizers
# store std. normalized dataset to pass to function (colours = class labels)
= np.append(x, y, axis = 0)
data_norm -1,0] = 0
data_norm[
# plot individual data points in plane
= superlearn.multiclass_illustrator.Visualizer(data_norm)
demo
demo.show_dataset()
# add DBs (pass weights defining 3 class DBs that give perfect accuracy)
-1]], cost =
demo.show_complete_coloring([weight_history_3CPT[ multiclass_perceptron)
The multi-class perceptron achieves a classification accuracy of 100%, yielding results that are comparable to those shown in Machine Learning Refined.
Wolberg, W. H., Street, W. N., & Mangasarian, O. L. (1992). Breast Cancer Dataset. University of Wisconsin Hospitals, Madison.↩︎
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml].
Irvine, CA: University of California, School of Information and
Computer Science.
↩︎
Watt, J., Borhani, R., & Katsaggelos, A. K. (2020). Machine learning refined: Foundations, algorithms, and applications. Cambridge University Press.↩︎