# import required packages
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# load data
bc = load_breast_cancer()

# normalize
x_min = np.min(bc.data, 0)
x_scale = 3*np.std(bc.data-x_min, 0)
bc_norm = (bc.data-x_min) / x_scale

# split data into training and validation set
train_X, val_X, train_y, val_y = train_test_split(bc_norm, bc.target, random_state=42)
class_labels = bc.target_names

# plot shapes
print(train_X.shape, train_y.shape, val_X.shape, val_y.shape)

(426, 30) (426,) (143, 30) (143,)


# learning rate is the step size as in classical gradient descent
l_rate = 1e-3

# epochs is the number of maximum iterations for minimization
epochs = 700

# batch size amounts to the number of samples used in one epoch iteration (up to hardware memory)
b_size = 2**4
assert b_size <= train_X.shape[0]

# insert column of ones as first feature entry to cover bias as a trainable parameter within the weight vector (instead of separate variable)
train_X, val_X = np.c_[np.ones(train_X.shape[0]), train_X], np.c_[np.ones(val_X.shape[0]), val_X]

# initialize weight vector such it has the same number of columns as input features
np.random.seed(1111)
w = np.random.uniform(size=(train_X.shape[1],)) * 0.1

# initialize a list to track the loss value for each epoch
loss_list = []

# batch composition
def next_batch(X, y, b_size):
    
    # loop over our dataset in mini-batches
    for i in np.arange(0, X.shape[0], b_size):
        
        # yield a tuple for current batch of data and labels
        yield (X[i:i+b_size], y[i:i+b_size])

for epoch in range(epochs+1):
    
    # reset total epoch loss
    epoch_loss = []
    
    # loop over data in batches
    for (batch_X, batch_y) in next_batch(train_X, train_y, b_size):
        
        # take dot product between current feature batch and weights
        preds_y = np.dot(batch_X, w)
        
        # compare prediction and true values
        diff = preds_y - batch_y
        
        # compute mean of squared loss for current batch
        epoch_loss.extend(diff**2)

        # compute the derivative
        gradient = 2 * np.dot(batch_X.T, diff)
        
        # scale gradient of current batch to step in the correct direction
        w -= l_rate * gradient
        
    # update loss list by taking average across all batches
    loss_list.append(np.mean(epoch_loss))


%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15, 5))
# plot loss function
plt.plot(range(len(loss_list)), loss_list)
plt.title('Training Loss', fontsize=14)
plt.xlabel('Epoch #', fontsize=14)
plt.ylabel('$L(\mathbf{w})$', fontsize=14)
plt.show()


sigmoid = lambda y: 1.0 / (1 + np.exp(-y))


# threshold estimation
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(train_y, np.dot(train_X, w))
tau = thresholds[np.argmax(tpr - fpr)]

# compute predictions from test set
pred_y = (sigmoid(np.dot(val_X, w)) >= tau).astype('uint8')


# compose confusion matrix
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_true=val_y, y_pred=pred_y)

# print confusion matrix
fig, ax2 = plt.subplots(1, 1, figsize=(15,5))
ax2.matshow(conf_mat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax2.text(x=j, y=i, s=conf_mat[i, j], va='center', ha='center', size='xx-large')

ax2.set_title('Validation Confusion Matrix', fontsize=14)
ax2.set_xlabel('Predictions $\mathbf{\hat{y}}$', fontsize=14)
ax2.set_ylabel('Actuals $\mathbf{y}$', fontsize=14)
ax2.set_yticks([0, 1])
ax2.set_xticks([0, 1])
ax2.set_yticklabels(class_labels)
ax2.set_xticklabels(class_labels)
ax2.tick_params(top=False, bottom=True, labeltop=False, labelbottom=True)
plt.show()


from matplotlib.animation import FuncAnimation
from IPython.display import HTML
#plt.style.use('seaborn-notebook')

# animated figure that plots the loss over time
fig, ax = (plt.figure(figsize=(8, 5)), plt.axes())
ax.set_xlabel('Epoch #', fontsize=16)
ax.set_ylabel('Loss $F(\mathbf{w})$', fontsize=16)
ax.xaxis.set_tick_params(labelsize=12)
ax.yaxis.set_tick_params(labelsize=12)
line, = ax.semilogy(range(len(loss_list)), loss_list, lw=2)
point, = ax.plot(0, np.nan, 'r', marker='.', markersize=14)
plt.tight_layout()
plt.close()
# animation
div = 50
def animate(i):
    line.set_data(np.arange(len(loss_list))[:i*div], loss_list[:i*div])
    point.set_data(i*div, loss_list[i*div])
    return line, point

anim = FuncAnimation(fig, animate, interval=div, frames=epochs//div+1)

if False:
    anim.save('../img/sgd_anim.gif', writer='imagemagick', fps=5)

    HTML(anim.to_jshtml())


# TP/(TP+FP)
precision = lambda conf_mat: conf_mat[0, 0]/(conf_mat[0, 0]+conf_mat[1, 0]) * 100
print('%.2f %% Precision' % precision(conf_mat))

# TP/(TP+FN)
recall = lambda conf_mat: conf_mat[0, 0]/(conf_mat[0, 0]+conf_mat[0, 1]) * 100
print('%.2f %% Recall' % recall(conf_mat))

# (TP+TN)/ALL
accuracy = lambda conf_mat: (conf_mat[0, 0]+conf_mat[1, 1])/np.sum(conf_mat) * 100
print('%.2f %% Accuracy' % accuracy(conf_mat))

98.11 % Precision
96.30 % Recall
97.90 % Accuracy


# mean squared error
MSE = lambda y, pred_y: np.round(sum((y-pred_y)**2)/len(y), 4)

# compute predictions of test and training sets
pred_val_y = np.round(sigmoid(np.dot(val_X, w))).astype('uint8')
pred_train_y = np.round(sigmoid(np.dot(train_X, w))).astype('uint8')

# compare MSEs
val_MSE = MSE(val_y, pred_val_y)
train_MSE = MSE(train_y, pred_train_y)
res = np.isclose(val_MSE, train_MSE, rtol=.95)

if res:
    print('MSEs are close enough with %s (test) and %s (train).' % (val_MSE, train_MSE))
else:
    print('Potential over-/underfitting from MSEs with %s (test) and %s (train).' % (val_MSE, train_MSE))

MSEs are close enough with 0.2238 (test) and 0.2582 (train).


import torch
torch.manual_seed(1111)

# define single layer model
class SingleLayerNet(torch.nn.Module):
    def __init__(self, n_features):
        super(SingleLayerNet, self).__init__()
        self.linear = torch.nn.Linear(n_features, 1, bias=True)
        torch.nn.init.uniform_(self.linear.weight, 0, 1e-1)
        
    def forward(self, X):
        z = self.linear(X)
        return torch.squeeze(z, 1)
    
# convert to torch data types
train_Xt = torch.autograd.Variable(torch.FloatTensor(train_X))
train_yt = torch.autograd.Variable(torch.FloatTensor(train_y))
val_Xt = torch.autograd.Variable(torch.FloatTensor(val_X))
val_yt = torch.autograd.Variable(torch.FloatTensor(val_y))

# instantiate model and loss
model = SingleLayerNet(n_features=train_Xt.shape[1])
optimizer = torch.optim.SGD(model.parameters(), lr=l_rate)
criterion = torch.nn.MSELoss()

# training
loss_list = []
for epoch in range(epochs+1):
    # loop over data in batches
    for i_X, i_y in next_batch(train_Xt, train_yt, b_size):
        pred_y = model(i_X)
        loss = criterion(pred_y, i_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # track loss
    loss_list.append(loss.item())
    if epoch % 100 == 0:
        y_val_pred = model(val_Xt)
        y_train_pred = model(train_Xt)
        val_loss = criterion(y_val_pred, val_yt)
        train_loss = criterion(y_train_pred, train_yt)
        pred_yt = sigmoid(y_val_pred.detach().numpy()) >= tau
        conf_mat = confusion_matrix(y_true=val_yt.numpy(), y_pred=pred_yt)
        val_acc = accuracy(conf_mat)
        train_acc = accuracy(conf_mat)
        print(
            f'''epoch {epoch}
            Train set - loss: {np.round(train_loss.detach().numpy().astype('float'), 3)}, accuracy: {np.round(train_acc, 3)}
            Validation  set - loss: {np.round(val_loss.detach().numpy().astype('float'), 3)}, accuracy: {np.round(val_acc, 3)}
            ''')

if False:
    torch.save(model, 'torch_sgd_model.pth')

epoch 0
            Train set - loss: 0.445, accuracy: 12.587
            Validation  set - loss: 0.448, accuracy: 12.587
            
epoch 100
            Train set - loss: 0.102, accuracy: 93.007
            Validation  set - loss: 0.087, accuracy: 93.007
            
epoch 200
            Train set - loss: 0.081, accuracy: 95.105
            Validation  set - loss: 0.073, accuracy: 95.105
            
epoch 300
            Train set - loss: 0.073, accuracy: 95.804
            Validation  set - loss: 0.067, accuracy: 95.804
            
epoch 400
            Train set - loss: 0.069, accuracy: 96.503
            Validation  set - loss: 0.064, accuracy: 96.503
            
epoch 500
            Train set - loss: 0.066, accuracy: 96.503
            Validation  set - loss: 0.063, accuracy: 96.503
            
epoch 600
            Train set - loss: 0.065, accuracy: 96.503
            Validation  set - loss: 0.061, accuracy: 96.503
            
epoch 700
            Train set - loss: 0.064, accuracy: 96.503
            Validation  set - loss: 0.06, accuracy: 96.503


# threshold estimation
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(train_yt, model(train_Xt).detach())
tau = thresholds[np.argmax(tpr - fpr)]

# compute predictions from test set
pred_yt = sigmoid(y_val_pred.detach()) >= tau


conf_mat = confusion_matrix(y_true=val_yt, y_pred=pred_yt)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# plot loss function
ax1.plot(range(len(loss_list)), loss_list)
ax1.set_title('Training Loss', fontsize=14)
ax1.set_xlabel('Epoch #', fontsize=14)
ax1.set_ylabel('$L(\mathbf{w})$', fontsize=14)

# print confusion matrix
ax2.matshow(conf_mat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax2.text(x=j, y=i, s=conf_mat[i, j], va='center', ha='center', size='xx-large')

ax2.set_title('Validation Confusion Matrix', fontsize=14)
ax2.set_xlabel('Predictions $\mathbf{\hat{y}}$', fontsize=14)
ax2.set_ylabel('Actuals $\mathbf{y}$', fontsize=14)
ax2.set_yticks([0, 1])
ax2.set_xticks([0, 1])
ax2.set_yticklabels(class_labels)
ax2.set_xticklabels(class_labels)
ax2.tick_params(top=False, bottom=True, labeltop=False, labelbottom=True)
plt.show()

Stochastic Gradient Descent (SGD)¶

Data acquisition¶

Gradient descent¶

Optimization¶

Mini-Batching¶

Implementation¶

Validation¶

Activation function¶

Loss animation¶

Overfitting validation¶

PyTorch equivalent¶