MNIST with keras

Jupyter notebook for this exercise is available for download here:

Tarred session dump (105M) of this Jupyter notebook is available for download here. That could save forkers’ run time and cloud budget. Dill dumps contain all variables and may be loaded in two lines:

# NOTE(security): a dill session dump is pickle-based, so loading it can run
# arbitrary code -- only load dumps obtained from a source you trust.
import dill
dill.load_session('whatever_filename.db')

We will explore:

6 different architectures
3 different optimizers
varying training size
customised model summary
fit with callbacks

We begin by preparing 2 sets of train_X (train_X1 and train_X2) to meet different shape requirements of Dense and Conv2D nets.

from keras.datasets import mnist
from keras.utils import to_categorical
from keras import models, layers, callbacks
import time
import numpy as np
# Seed NumPy's global random generator.
# NOTE(review): this seeds NumPy only; the Keras backend may keep its own RNG
# that needs separate seeding for repeatable runs -- confirm.
np.random.seed(77)

# Load the MNIST train/test split through keras.datasets.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
# Number of distinct label values found in the training targets.
nclasses = np.unique(train_y).size

def shapex(X):
    """Return two float32 layouts of an image batch, both divided by 255.

    Args:
        X: array whose first axis indexes samples.

    Returns:
        (flat, with_channel): ``flat`` has shape (n_samples, product of the
        remaining axes) for Dense nets; ``with_channel`` keeps the original
        axes and appends one trailing axis of length 1 for Conv2D nets.
    """
    scaled = X.astype('float32') / 255
    n_samples = scaled.shape[0]
    features_per_sample = np.prod(scaled.shape[1:])
    flat = scaled.reshape(n_samples, features_per_sample)
    # A trailing singleton axis serves as the single colour channel.
    with_channel = scaled[..., np.newaxis]
    return flat, with_channel

# Flattened (X1) and trailing-channel (X2) versions of both splits.
train_X1, train_X2 = shapex(train_X)
test_X1, test_X2 = shapex(test_X)
# Convert the integer class labels to one-hot rows, as required by the
# categorical_crossentropy loss the models are compiled with.
train_y = to_categorical(train_y)
test_y = to_categorical(test_y)

# Notebook cell output: shapes of the raw, flattened and 4-D training arrays.
train_X.shape, train_X1.shape, train_X2.shape

Next is our own, more informative version of ‘model.summary()’. The model label is extracted directly from the model's attributes. To keep labels from running too long:

class names are abbreviated to their first 2 characters e.g. De for Dense, Ac for Activation
activations are abbreviated to their first 3 characters e.g. lin for linear, rel for relu

Example: label RMSprop:De512rel|De10sof says the model is 2-layered, and has optimizer=RMSprop; the first layer is Dense with 512 units and activation relu; the second layer is Dense with 10 units and activation softmax.

def summarisetis(model):
    """Print a per-layer summary table for `model` and return a compact label.

    The label has the form ``<Optimizer>:<layer>|<layer>|...``. Each layer
    part is the first 2 characters of the layer class name, followed by the
    layer's ``units`` (when it has that attribute) and the first 3 characters
    of its activation name (when it has one), e.g.
    ``RMSprop:De512rel|De10sof``.

    Args:
        model: a compiled model exposing ``optimizer`` and ``layers``; every
            layer must provide ``input_shape``, ``output_shape`` and
            ``count_params()``.

    Returns:
        The model label string.
    """
    # Take names from the classes themselves rather than parsing repr()
    # strings, whose format is not guaranteed.
    optname = type(model.optimizer).__name__
    print(optname)
    print('{:12s}{:>10s}{:>10s}{:>10s}{:>10s}{:>10s}{:>10s}'.format('class', 'input', 'output', 'units', 'params', 'activ', 'label'))
    print('========================================================================')
    layerlabels = []
    for l in model.layers:
        classname = type(l).__name__
        # Only axis 1 of each shape is reported (for Conv2D this is one
        # spatial dimension, not the full feature-map size).
        print('{:12s}{:10d}'.format(classname, l.input_shape[1]), end='')
        print('{:10d}'.format(l.output_shape[1]), end='')
        layerlabel = classname[:2]

        # Layers without `units` (e.g. Dropout, Conv2D) get a blank column.
        units = getattr(l, 'units', None)
        if units is None:
            print('{:10s}'.format(''), end='')
        else:
            layerlabel = f'{layerlabel}{units}'
            print('{:10d}'.format(units), end='')
        print('{:10d}'.format(l.count_params()), end='')

        # Layers without an activation get a blank column as well.
        activation = getattr(l, 'activation', None)
        if activation is None:
            print('{:10s}{:>10s}'.format('', layerlabel))
        else:
            actname = getattr(activation, '__name__', type(activation).__name__)
            layerlabel = layerlabel + actname[:3]
            print('{:>10s}{:>10s}'.format(actname, layerlabel))
        layerlabels.append(layerlabel)

    modellabel = optname + ':' + '|'.join(layerlabels)
    print('labelling this model as', modellabel,'\n')
    return modellabel

We explore 6 different architectures:

def arch(which):
    """Build one of six uncompiled Sequential architectures for MNIST.

    Args:
        which: architecture index.
            0 -- Dense(512, relu) -> Dense(softmax)
            1 -- Dense -> softmax (no hidden layer)
            2 -- two Dense(128) + relu blocks -> Dense -> softmax
            3 -- as 2, with Dropout(0.3) after each hidden relu
            4 -- three Conv2D(3x3, relu) layers with two max-pools,
                 then Flatten -> Dense(64, relu) -> Dense(softmax)
            5 -- two 'same'-padded Conv2D(5x5) + relu + max-pool blocks,
                 then Flatten -> Dense(500) + relu -> Dense -> softmax

    Architectures 0-3 take the flattened input (shape of train_X1 rows);
    4-5 take the 4-D input (shape of train_X2 samples). The output layer
    always has `nclasses` units.

    Returns:
        The uncompiled Sequential model.

    Raises:
        ValueError: if `which` is not in 0..5.
    """
    m = models.Sequential()
    if which==0:
        hu = 512
        m.add(layers.Dense(hu, activation='relu', input_shape=(train_X1.shape[1],)))
        m.add(layers.Dense(nclasses, activation='softmax'))
    elif which==1:
        m.add(layers.Dense(nclasses, input_shape=(train_X1.shape[1],)))
        m.add(layers.Activation('softmax'))
    elif which==2:
        hu = 128
        m.add(layers.Dense(hu, input_shape=(train_X1.shape[1],)))
        m.add(layers.Activation('relu'))
        m.add(layers.Dense(hu))
        m.add(layers.Activation('relu'))
        m.add(layers.Dense(nclasses))
        m.add(layers.Activation('softmax'))
    elif which==3:
        dropout = .3
        hu = 128
        m.add(layers.Dense(hu, input_shape=(train_X1.shape[1],)))
        m.add(layers.Activation('relu'))
        m.add(layers.Dropout(dropout))
        m.add(layers.Dense(hu))
        m.add(layers.Activation('relu'))
        m.add(layers.Dropout(dropout))
        m.add(layers.Dense(nclasses))
        m.add(layers.Activation('softmax'))
    elif which==4:
        m.add(layers.Conv2D(32, (3,3), activation='relu', input_shape=train_X2.shape[1:]))
        m.add(layers.MaxPooling2D((2,2)))
        m.add(layers.Conv2D(64, (3,3), activation='relu'))
        m.add(layers.MaxPooling2D((2,2)))
        m.add(layers.Conv2D(64, (3,3), activation='relu'))
        m.add(layers.Flatten())
        m.add(layers.Dense(64, activation='relu'))
        m.add(layers.Dense(nclasses, activation='softmax'))
    elif which==5:
        m.add(layers.Conv2D(20, (5,5), padding='same', input_shape=train_X2.shape[1:]))
        m.add(layers.Activation('relu'))
        m.add(layers.MaxPooling2D((2,2), strides=(2,2)))
        # `padding` is the Keras 2 keyword (the Keras 1 name was
        # `border_mode`), matching the first Conv2D above.
        m.add(layers.Conv2D(50, (5,5), padding='same'))
        m.add(layers.Activation('relu'))
        m.add(layers.MaxPooling2D((2,2), strides=(2,2)))
        m.add(layers.Flatten())
        m.add(layers.Dense(500))
        m.add(layers.Activation('relu'))
        m.add(layers.Dense(nclasses))
        m.add(layers.Activation('softmax'))
    else:
        # Fail loudly instead of handing back an empty, unusable model.
        raise ValueError('unknown architecture index: {!r} (expected 0..5)'.format(which))
    return m

def compiletis(model, op):
    """Compile `model` with optimizer `op` and return the same model object.

    The loss is fixed to categorical cross-entropy and accuracy is the only
    tracked metric.
    """
    settings = {
        'optimizer': op,
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**settings)
    return model

Compile each architecture with 3 different optimisers: adam, rmsprop and sgd:

# One freshly built model per (architecture, optimiser) pair, ordered
# architecture-major: adam, rmsprop, sgd for architecture 0, then 1, ...
model = [compiletis(arch(nm), opt)
         for nm in range(6)
         for opt in ('adam', 'rmsprop', 'sgd')]

# Print each model's summary table and keep the label it returns.
modellabel = []
for nm, m in enumerate(model):
    print('model #{}: optimiser = '.format(nm), end='')
    modellabel.append(summarisetis(m))

We see the output:

model #0: optimiser = Adam 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       512       512    401920      relu  De512rel 
Dense              512        10        10      5130   softmax   De10sof 
labelling this model as Adam:De512rel|De10sof  
model #1: optimiser = RMSprop 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       512       512    401920      relu  De512rel 
Dense              512        10        10      5130   softmax   De10sof 
labelling this model as RMSprop:De512rel|De10sof  
model #2: optimiser = SGD 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       512       512    401920      relu  De512rel 
Dense              512        10        10      5130   softmax   De10sof 
labelling this model as SGD:De512rel|De10sof  
model #3: optimiser = Adam 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784        10        10      7850    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as Adam:De10lin|Acsof  
model #4: optimiser = RMSprop 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784        10        10      7850    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as RMSprop:De10lin|Acsof  
model #5: optimiser = SGD 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784        10        10      7850    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as SGD:De10lin|Acsof  
model #6: optimiser = Adam 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       128       128    100480    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dense              128       128       128     16512    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dense              128        10        10      1290    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as Adam:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof  
model #7: optimiser = RMSprop 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       128       128    100480    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dense              128       128       128     16512    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dense              128        10        10      1290    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as RMSprop:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof  
model #8: optimiser = SGD 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       128       128    100480    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dense              128       128       128     16512    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dense              128        10        10      1290    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as SGD:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof  
model #9: optimiser = Adam 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       128       128    100480    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dropout            128       128                   0                  Dr 
Dense              128       128       128     16512    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dropout            128       128                   0                  Dr 
Dense              128        10        10      1290    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as Adam:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof  
model #10: optimiser = RMSprop 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       128       128    100480    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dropout            128       128                   0                  Dr 
Dense              128       128       128     16512    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dropout            128       128                   0                  Dr 
Dense              128        10        10      1290    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof  
model #11: optimiser = SGD 
class            input    output     units    params     activ     label 
======================================================================== 
Dense              784       128       128    100480    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dropout            128       128                   0                  Dr 
Dense              128       128       128     16512    linear  De128lin 
Activation         128       128                   0      relu     Acrel 
Dropout            128       128                   0                  Dr 
Dense              128        10        10      1290    linear   De10lin 
Activation          10        10                   0   softmax     Acsof 
labelling this model as SGD:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof  
model #12: optimiser = Adam 
class            input    output     units    params     activ     label 
======================================================================== 
Conv2D              28        26                 320      relu     Corel 
MaxPooling2D        26        13                   0                  Ma 
Conv2D              13        11               18496      relu     Corel 
MaxPooling2D        11         5                   0                  Ma 
Conv2D               5         3               36928      relu     Corel 
Flatten              3       576                   0                  Fl 
Dense              576        64        64     36928      relu   De64rel 
Dense               64        10        10       650   softmax   De10sof 
labelling this model as Adam:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof  
model #13: optimiser = RMSprop 
class            input    output     units    params     activ     label 
======================================================================== 
Conv2D              28        26                 320      relu     Corel 
MaxPooling2D        26        13                   0                  Ma 
Conv2D              13        11               18496      relu     Corel 
MaxPooling2D        11         5                   0                  Ma 
Conv2D               5         3               36928      relu     Corel 
Flatten              3       576                   0                  Fl 
Dense              576        64        64     36928      relu   De64rel 
Dense               64        10        10       650   softmax   De10sof 
labelling this model as RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof  
model #14: optimiser = SGD 
class            input    output     units    params     activ     label 
======================================================================== 
Conv2D              28        26                 320      relu     Corel 
MaxPooling2D        26        13                   0                  Ma 
Conv2D              13        11               18496      relu     Corel 
MaxPooling2D        11         5                   0                  Ma 
Conv2D               5         3               36928      relu     Corel 
Flatten              3       576                   0                  Fl 
Dense              576        64        64     36928      relu   De64rel 
Dense               64        10        10       650   softmax   De10sof 
labelling this model as SGD:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof

Let us define a function to make 2 subplots for each model fitting: losses vs epoch on the left, accuracy vs epoch on the right. Each subplot will have a line for training and another line for validation.

%matplotlib inline
import matplotlib.pyplot as plt
def plottis(history):
    """Plot loss (left) and accuracy (right) against epoch for one fit.

    Each subplot shows one line for training and one for validation.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict
            must contain ``loss``, ``val_loss`` and the accuracy series
            logged under either ``acc``/``val_acc`` or
            ``accuracy``/``val_accuracy``.
    """
    hist = history.history
    # The accuracy key depends on the Keras release: older ones log 'acc',
    # newer ones log the metric under the name given at compile time.
    acc_key = 'acc' if 'acc' in hist else 'accuracy'
    acc = hist[acc_key]
    val_acc = hist['val_' + acc_key]
    loss = hist['loss']
    val_loss = hist['val_loss']

    plt.figure(figsize=(10, 3))
    plt.subplot(121)
    plt.plot(range(1, len(loss) + 1), loss, label='training')
    plt.plot(range(1, len(val_loss) + 1), val_loss, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    # Without a legend the two labelled lines cannot be told apart.
    plt.legend()

    plt.subplot(122)
    plt.plot(range(1, len(acc) + 1), acc, label='training')
    plt.plot(range(1, len(val_acc) + 1), val_acc, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()

We fit with callbacks asking Keras to shorten the number of epochs if validation loss stops decreasing.

# Early-stopping callback passed to the exploratory fit in fittis: it watches
# the validation loss and stops training once it has gone 5 epochs
# (patience=5) without improving (min_delta=0 means any decrease counts).
cb = callbacks.EarlyStopping(monitor='val_loss',
                             min_delta=0,
                             patience=5,
                             verbose=0, mode='auto')
def fittis(model, bs, ep, train_X, train_y, test_X):
    """Fit `model` with early stopping, refit, and score it on the test set.

    Steps:
      1. Fit for up to `ep` epochs with a 30% validation split and the
         module-level early-stopping callback `cb`, then plot the curves.
      2. Find the (1-based) epochs with the best validation accuracy and the
         lowest validation loss.
      3. Fit again on all of `train_X` for that lowest-loss epoch count.
      4. Evaluate on `test_X` against the module-level `test_y`.

    Args:
        model: compiled Keras model; it is trained in place.
        bs: batch size.
        ep: maximum number of epochs for the first fit.
        train_X, train_y: training inputs and one-hot targets.
        test_X: test inputs; the targets come from the global `test_y`,
            which is not a parameter of this function.

    Returns:
        (model, myhistory) where myhistory is the list
        [train_acc, train_los, val_acc, val_los, iacc, ilos, test_acc, tim];
        the first four are per-epoch series from the first fit, `iacc` and
        `ilos` are the best epochs, and `tim` is the elapsed time in seconds
        (plotting included).
    """
    tic = time.perf_counter()
    history = model.fit(train_X, train_y,
                        epochs=ep, batch_size=bs,
                        validation_split=.3, verbose=0,
                        callbacks=[cb], shuffle=False)
    plottis(history)

    hist = history.history
    # The accuracy key depends on the Keras release: older ones log 'acc',
    # newer ones log the metric under the name given at compile time.
    acc_key = 'acc' if 'acc' in hist else 'accuracy'
    train_acc = hist[acc_key]
    train_los = hist['loss']
    val_acc = hist['val_' + acc_key]
    val_los = hist['val_loss']

    # argmax/argmin return the first best epoch and work whether the series
    # holds numpy scalars or plain Python floats (a `list == scalar`
    # comparison only works for the former).
    iacc = 1 + int(np.argmax(val_acc))
    ilos = 1 + int(np.argmin(val_los))

    # NOTE(review): the model is not re-initialised before this second fit,
    # so these `ilos` epochs continue from the weights already trained above
    # rather than starting from scratch -- confirm this is intended.
    model.fit(train_X, train_y, epochs=ilos, batch_size=bs, verbose=0, shuffle=False)
    _, test_acc = model.evaluate(test_X, test_y)
    tim = time.perf_counter()-tic
    print('train_acc = {:.3f}    val_acc = {:.3f}    epochs = {:3d}    test_acc = {:.3f}    time = {:.1e}'.format
           (max(train_acc), max(val_acc), ilos, test_acc, tim))
    myhistory = [train_acc, train_los, val_acc, val_los, iacc, ilos, test_acc, tim]
    return model, myhistory
# Fit every compiled model, feeding 4-D images to the convolutional nets and
# flattened vectors to the dense ones, and collect each run's history.
myhistory = []
for nm, m in enumerate(model):
    print(modellabel[nm])
    s = '{}'.format(m.layers[0])
    # The first layer's type decides which input layout the model expects.
    fit_X, eval_X = (train_X2, test_X2) if 'Conv2D' in s else (train_X1, test_X1)
    m, h = fittis(m, 128, 100, fit_X, train_y, eval_X)
    myhistory.append(h)

We get the output:

Adam:De512rel|De10sof 
10000/10000 [==============================] - 0s 34us/step 
train_acc = 0.999    val_acc = 0.975    epochs =   7    test_acc = 0.977    time = 2.5e+01 
RMSprop:De512rel|De10sof 
10000/10000 [==============================] - 0s 35us/step 
train_acc = 0.998    val_acc = 0.973    epochs =   5    test_acc = 0.978    time = 1.7e+01 
SGD:De512rel|De10sof 
10000/10000 [==============================] - 0s 35us/step 
train_acc = 0.976    val_acc = 0.964    epochs = 100    test_acc = 0.978    time = 2.1e+02 
Adam:De10lin|Acsof 
10000/10000 [==============================] - 0s 33us/step 
train_acc = 0.933    val_acc = 0.924    epochs =  27    test_acc = 0.927    time = 6.3e+01
RMSprop:De10lin|Acsof 
10000/10000 [==============================] - 0s 33us/step 
train_acc = 0.930    val_acc = 0.924    epochs =  19    test_acc = 0.927    time = 4.3e+01 
SGD:De10lin|Acsof 
10000/10000 [==============================] - 0s 33us/step 
train_acc = 0.917    val_acc = 0.915    epochs = 100    test_acc = 0.922    time = 1.9e+02 
Adam:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof 
10000/10000 [==============================] - 0s 35us/step 
train_acc = 0.995    val_acc = 0.970    epochs =   5    test_acc = 0.975    time = 2.0e+01 
RMSprop:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof 
10000/10000 [==============================] - 0s 36us/step 
train_acc = 0.996    val_acc = 0.971    epochs =   6    test_acc = 0.974    time = 2.0e+01 
SGD:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof 
10000/10000 [==============================] - 0s 36us/step 
train_acc = 0.986    val_acc = 0.968    epochs = 100    test_acc = 0.977    time = 2.2e+02 
Adam:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof 
10000/10000 [==============================] - 0s 40us/step 
train_acc = 0.981    val_acc = 0.974    epochs =  12    test_acc = 0.980    time = 4.3e+01 
RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof 
10000/10000 [==============================] - 0s 36us/step 
train_acc = 0.979    val_acc = 0.973    epochs =  10    test_acc = 0.981    time = 3.3e+01 
SGD:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof 
10000/10000 [==============================] - 0s 38us/step 
train_acc = 0.962    val_acc = 0.968    epochs = 100    test_acc = 0.979    time = 2.4e+02 
Adam:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof 
10000/10000 [==============================] - 1s 52us/step 
train_acc = 0.999    val_acc = 0.989    epochs =  14    test_acc = 0.991    time = 6.5e+01 
RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof 
10000/10000 [==============================] - 0s 50us/step 
train_acc = 0.996    val_acc = 0.987    epochs =   4    test_acc = 0.990    time = 2.3e+01 
SGD:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof 
10000/10000 [==============================] - 1s 51us/step 
train_acc = 0.994    val_acc = 0.982    epochs =  46    test_acc = 0.989    time = 1.6e+02

Judging by the highest test_acc, we find the best-performing architectures to be:

Colin|Acrel|Ma|Colin|Acrel|Ma|Fl|De500lin|Acrel|De10lin|Acsof
Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
De512rel|De10sof

As expected, SGD is slow. Overall (not taking the 4th significant figure too seriously), for the architectures tested here we find RMSprop most efficient.

Can we afford a smaller training set?

So far, we have been training with 60,000 data points. Can we afford any less? We pick 3 of the models from above, and run each with a reduced number of data points.

Varying train size with RMSprop:De10lin|Acsof

# Refit a fresh architecture-1 model on the first `sz` training samples,
# for sz = 10,000, 20,000, ..., 50,000.
for sz in range(10000, 60000, 10000):
    fresh = compiletis(arch(1), 'rmsprop')
    model.append(fresh)
    label = summarisetis(fresh)
    modellabel.append(label)
    print(label)
    m, h = fittis(fresh, 128, 100, train_X1[:sz], train_y[:sz], test_X1)
    myhistory.append(h)

Among the output lines we get the following when sz=40,000:

RMSprop:De10lin|Acsof 
10000/10000 [==============================] - 0s 42us/step 
train_acc = 0.934    val_acc = 0.916    epochs =  21    test_acc = 0.928    time = 3.7e+01

Reducing training samples to 40,000 got us to test_acc=.928, which is on par with the test_acc=.927 this model scored above when all 60,000 samples were used for training. So this model does not need all 60,000 training samples.

Varying train size with RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof

# Same experiment for the dropout network (architecture 3): train a new
# model on the first 10,000 ... 50,000 samples and record each run.
for sz in range(10000, 60000, 10000):
    fresh = compiletis(arch(3), 'rmsprop')
    model.append(fresh)
    label = summarisetis(fresh)
    modellabel.append(label)
    print(label)
    m, h = fittis(fresh, 128, 100, train_X1[:sz], train_y[:sz], test_X1)
    myhistory.append(h)

Among the lines of output we find the following when sz=50,000:

RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof 
10000/10000 [==============================] - 0s 45us/step 
train_acc = 0.977    val_acc = 0.967    epochs =   9    test_acc = 0.978    time = 3.0e+01

Reducing training samples to 50,000 got us to test_acc=.978, which is not far from the previous score of test_acc=.981 when all 60,000 samples were used for training.

Varying train size with RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof

# Same experiment for the convolutional network (architecture 4), which
# takes the 4-D image arrays instead of the flattened ones.
for sz in range(10000, 60000, 10000):
    fresh = compiletis(arch(4), 'rmsprop')
    model.append(fresh)
    label = summarisetis(fresh)
    modellabel.append(label)
    print(label)
    m, h = fittis(fresh, 128, 100, train_X2[:sz], train_y[:sz], test_X2)
    myhistory.append(h)

Among the lines of output we find the following when sz=40,000:

RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof 
10000/10000 [==============================] - 1s 57us/step 
train_acc = 0.996    val_acc = 0.984    epochs =   3    test_acc = 0.990    time = 1.5e+01

Reducing training samples to 40,000 got us to test_acc=.990, which is the same as the previous score of test_acc=.990 when all 60,000 samples were used for training. So we can afford to have just 40,000 training samples.

Learning!

Can we afford a smaller training set?

Varying train size with RMSprop:De10lin|Acsof

Varying train size with RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof

Varying train size with RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof

Leave a Reply Cancel reply

Metric: mutual info

Metric: silhouette score

Metrics: homogeneity score, completeness score, v measure

Metric: Fowlkes-Mallows score

Metric: entropy

Metric: Davies-Bouldin index

Metric: Calinski-Harabasz index

Metric: adjusted rand score