Jupyter notebook for this exercise is available for download here:
Tarred session dump (105M) of this Jupyter notebook is available for download here. That could save forkers’ run time and cloud budget. Dill dumps contain all variables and may be loaded in two lines:
# NOTE(review): dill session dumps are pickle-based, so loading one can run
# arbitrary code -- only load a dump obtained from a source you trust.
import dill
# Restore the variables stored in the session dump file.
dill.load_session('whatever_filename.db')
We will explore:
- 6 different architectures
- 3 different optimizers
- varying training size
- customised model summary
- fit with callbacks
We begin by preparing 2 sets of train_X (train_X1 and train_X2) to meet different shape requirements of Dense and Conv2D nets.
from keras.datasets import mnist
from keras.utils import to_categorical
from keras import models, layers, callbacks
import time
import numpy as np
# Seed NumPy's global random generator.
# NOTE(review): presumably meant to make runs repeatable; whether this also
# pins the Keras backend's own randomness is not shown here -- confirm.
np.random.seed(77)
# Load the MNIST images and labels, already split into train and test sets.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
# Number of distinct labels in the training set; arch() uses it as the width
# of every output layer.
nclasses = np.unique(train_y).size
def shapex(X):
    """Rescale X by 1/255 and return it in two layouts.

    Returns a tuple ``(flat, with_channel)``:

    * ``flat`` keeps the first axis and collapses all remaining axes into
      one (the layout fed to the Dense nets);
    * ``with_channel`` keeps every axis and appends a trailing axis of
      length 1 (the layout fed to the Conv2D nets).

    Both are float32.
    """
    scaled = X.astype('float32') / 255
    n_samples, sample_shape = scaled.shape[0], scaled.shape[1:]
    flat = scaled.reshape(n_samples, np.prod(sample_shape))
    with_channel = scaled.reshape(scaled.shape + (1,))
    return flat, with_channel
# Two layouts of the same images: *_X1 is flattened per sample (for the Dense
# nets), *_X2 has a trailing length-1 axis (for the Conv2D nets).
train_X1, train_X2 = shapex(train_X)
test_X1, test_X2 = shapex(test_X)
# Convert the label vectors to the categorical form expected by the
# 'categorical_crossentropy' loss used in compiletis().
train_y = to_categorical(train_y)
test_y = to_categorical(test_y)
# Notebook cell output: echo the original and the two reshaped shapes.
train_X.shape, train_X1.shape, train_X2.shape
Next is our own, more informative version of ‘model.summary()’. The model label is extracted directly from the model's attributes. To keep labels from running too long:
- class names are abbreviated to their first 2 characters e.g. De for Dense, Ac for Activation
- activations are abbreviated to their first 3 characters e.g. lin for linear, rel for relu
Example: label RMSprop:De512rel|De10sof says the model is 2-layered, and has optimizer=RMSprop; the first layer is Dense with 512 units and activation relu; the second layer is Dense with 10 units and activation softmax.
def summarisetis(model):
    """Print a one-row-per-layer summary of `model` and return a short label.

    The label has the form ``<optimizer>:<layer>|<layer>|...`` where each
    layer part is the first 2 characters of the layer's class name, followed
    by its ``units`` (if the layer has that attribute) and the first 3
    characters of its activation name (if the layer has an activation),
    e.g. ``RMSprop:De512rel|De10sof``.

    Class and activation names are read from ``type(obj).__name__`` and
    ``activation.__name__`` rather than parsed out of ``repr()`` text, so the
    label does not depend on how a given Keras version formats its reprs.

    Args:
        model: a compiled model exposing ``optimizer`` and ``layers``; every
            layer must expose ``input_shape``, ``output_shape`` and
            ``count_params()``.

    Returns:
        The model label string (also printed).
    """
    optimiser_name = type(model.optimizer).__name__
    # The caller prints a 'model #n: optimiser = ' prefix with end='', so this
    # completes that line.
    print(optimiser_name)
    print('{:12s}{:>10s}{:>10s}{:>10s}{:>10s}{:>10s}{:>10s}'.format('class', 'input', 'output', 'units', 'params', 'activ', 'label'))
    print('========================================================================')
    layerlabels = []
    for l in model.layers:
        class_name = type(l).__name__
        # Only element [1] of each shape is shown (the first non-batch size).
        print('{:12s}{:10d}'.format(class_name, l.input_shape[1]), end='')
        print('{:10d}'.format(l.output_shape[1]), end='')
        layerlabel = class_name[:2]
        # Layers without a `units` attribute get a blank units column.
        try:
            units = l.units
        except AttributeError:
            print('{:10s}'.format(''), end='')
        else:
            layerlabel = f'{layerlabel}{units}'
            print('{:10d}'.format(units), end='')
        print('{:10d}'.format(l.count_params()), end='')
        # Layers without a named activation get a blank activation column.
        try:
            activation_name = l.activation.__name__
        except AttributeError:
            print('{:10s}{:>10s}'.format('', layerlabel))
        else:
            layerlabel = layerlabel + activation_name[:3]
            print('{:>10s}{:>10s}'.format(activation_name, layerlabel))
        layerlabels.append(layerlabel)
    modellabel = optimiser_name + ':' + '|'.join(layerlabels)
    print('labelling this model as', modellabel, '\n')
    return modellabel
We explore 6 different architectures (which = 0 to 5):
def arch(which):
    """Build (without compiling) one of six Sequential architectures.

    Args:
        which: architecture selector.
            0 - Dense(512, relu) -> Dense(nclasses, softmax)
            1 - Dense(nclasses) -> softmax (no hidden layer)
            2 - two Dense(128) + relu blocks -> Dense(nclasses) -> softmax
            3 - as 2, with Dropout(0.3) after each relu
            4 - three 3x3 Conv2D layers (32, 64, 64 filters) with two 2x2
                max-pools, then Dense(64, relu) -> Dense(nclasses, softmax)
            5 - two 5x5 'same'-padded Conv2D layers (20, 50 filters), each
                followed by relu and a 2x2 max-pool, then Dense(500) + relu
                -> Dense(nclasses) -> softmax

    Returns:
        The Sequential model. Any other value of `which` returns an empty
        Sequential, because there is no fallback branch.

    Dense nets take their input size from the module-level `train_X1`, conv
    nets from `train_X2`; the output width is the module-level `nclasses`.
    """
    m = models.Sequential()
    if which == 0:
        hu = 512
        m.add(layers.Dense(hu, activation='relu', input_shape=(train_X1.shape[1],)))
        m.add(layers.Dense(nclasses, activation='softmax'))
    elif which == 1:
        m.add(layers.Dense(nclasses, input_shape=(train_X1.shape[1],)))
        m.add(layers.Activation('softmax'))
    elif which == 2:
        hu = 128
        m.add(layers.Dense(hu, input_shape=(train_X1.shape[1],)))
        m.add(layers.Activation('relu'))
        m.add(layers.Dense(hu))
        m.add(layers.Activation('relu'))
        m.add(layers.Dense(nclasses))
        m.add(layers.Activation('softmax'))
    elif which == 3:
        dropout = .3
        hu = 128
        m.add(layers.Dense(hu, input_shape=(train_X1.shape[1],)))
        m.add(layers.Activation('relu'))
        m.add(layers.Dropout(dropout))
        m.add(layers.Dense(hu))
        m.add(layers.Activation('relu'))
        m.add(layers.Dropout(dropout))
        m.add(layers.Dense(nclasses))
        m.add(layers.Activation('softmax'))
    elif which == 4:
        m.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=train_X2.shape[1:]))
        m.add(layers.MaxPooling2D((2, 2)))
        m.add(layers.Conv2D(64, (3, 3), activation='relu'))
        m.add(layers.MaxPooling2D((2, 2)))
        m.add(layers.Conv2D(64, (3, 3), activation='relu'))
        m.add(layers.Flatten())
        m.add(layers.Dense(64, activation='relu'))
        m.add(layers.Dense(nclasses, activation='softmax'))
    elif which == 5:
        m.add(layers.Conv2D(20, (5, 5), padding='same', input_shape=train_X2.shape[1:]))
        m.add(layers.Activation('relu'))
        m.add(layers.MaxPooling2D((2, 2), strides=(2, 2)))
        # `padding` is the Keras 2 name of the old Keras 1 `border_mode`
        # keyword; use it here as well, matching the first conv layer.
        m.add(layers.Conv2D(50, (5, 5), padding='same'))
        m.add(layers.Activation('relu'))
        m.add(layers.MaxPooling2D((2, 2), strides=(2, 2)))
        m.add(layers.Flatten())
        m.add(layers.Dense(500))
        m.add(layers.Activation('relu'))
        m.add(layers.Dense(nclasses))
        m.add(layers.Activation('softmax'))
    return m
def compiletis(model, op):
    """Compile `model` in place with optimizer `op` and return it.

    Every model gets the same loss ('categorical_crossentropy') and reports
    'accuracy' as its only metric, so only the optimizer varies.
    """
    settings = {
        'optimizer': op,
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**settings)
    return model
Compile each architecture with 3 different optimisers: adam, rmsprop and sgd:
# One freshly built and compiled model per (architecture, optimiser) pair,
# ordered architecture-major: adam, rmsprop, sgd for arch 0, then arch 1, ...
model, modellabel = [], []
for nm in range(6):
    for op in ('adam', 'rmsprop', 'sgd'):
        model.append(compiletis(arch(nm), op))
# Print each model's summary table and keep its label at the same index.
for nm, m in enumerate(model):
    print('model #{}: optimiser = '.format(nm), end='')
    modellabel.append(summarisetis(m))
We see the output:
model #0: optimiser = Adam
class input output units params activ label
========================================================================
Dense 784 512 512 401920 relu De512rel
Dense 512 10 10 5130 softmax De10sof
labelling this model as Adam:De512rel|De10sof
model #1: optimiser = RMSprop
class input output units params activ label
========================================================================
Dense 784 512 512 401920 relu De512rel
Dense 512 10 10 5130 softmax De10sof
labelling this model as RMSprop:De512rel|De10sof
model #2: optimiser = SGD
class input output units params activ label
========================================================================
Dense 784 512 512 401920 relu De512rel
Dense 512 10 10 5130 softmax De10sof
labelling this model as SGD:De512rel|De10sof
model #3: optimiser = Adam
class input output units params activ label
========================================================================
Dense 784 10 10 7850 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as Adam:De10lin|Acsof
model #4: optimiser = RMSprop
class input output units params activ label
========================================================================
Dense 784 10 10 7850 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as RMSprop:De10lin|Acsof
model #5: optimiser = SGD
class input output units params activ label
========================================================================
Dense 784 10 10 7850 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as SGD:De10lin|Acsof
model #6: optimiser = Adam
class input output units params activ label
========================================================================
Dense 784 128 128 100480 linear De128lin
Activation 128 128 0 relu Acrel
Dense 128 128 128 16512 linear De128lin
Activation 128 128 0 relu Acrel
Dense 128 10 10 1290 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as Adam:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof
model #7: optimiser = RMSprop
class input output units params activ label
========================================================================
Dense 784 128 128 100480 linear De128lin
Activation 128 128 0 relu Acrel
Dense 128 128 128 16512 linear De128lin
Activation 128 128 0 relu Acrel
Dense 128 10 10 1290 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as RMSprop:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof
model #8: optimiser = SGD
class input output units params activ label
========================================================================
Dense 784 128 128 100480 linear De128lin
Activation 128 128 0 relu Acrel
Dense 128 128 128 16512 linear De128lin
Activation 128 128 0 relu Acrel
Dense 128 10 10 1290 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as SGD:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof
model #9: optimiser = Adam
class input output units params activ label
========================================================================
Dense 784 128 128 100480 linear De128lin
Activation 128 128 0 relu Acrel
Dropout 128 128 0 Dr
Dense 128 128 128 16512 linear De128lin
Activation 128 128 0 relu Acrel
Dropout 128 128 0 Dr
Dense 128 10 10 1290 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as Adam:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
model #10: optimiser = RMSprop
class input output units params activ label
========================================================================
Dense 784 128 128 100480 linear De128lin
Activation 128 128 0 relu Acrel
Dropout 128 128 0 Dr
Dense 128 128 128 16512 linear De128lin
Activation 128 128 0 relu Acrel
Dropout 128 128 0 Dr
Dense 128 10 10 1290 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
model #11: optimiser = SGD
class input output units params activ label
========================================================================
Dense 784 128 128 100480 linear De128lin
Activation 128 128 0 relu Acrel
Dropout 128 128 0 Dr
Dense 128 128 128 16512 linear De128lin
Activation 128 128 0 relu Acrel
Dropout 128 128 0 Dr
Dense 128 10 10 1290 linear De10lin
Activation 10 10 0 softmax Acsof
labelling this model as SGD:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
model #12: optimiser = Adam
class input output units params activ label
========================================================================
Conv2D 28 26 320 relu Corel
MaxPooling2D 26 13 0 Ma
Conv2D 13 11 18496 relu Corel
MaxPooling2D 11 5 0 Ma
Conv2D 5 3 36928 relu Corel
Flatten 3 576 0 Fl
Dense 576 64 64 36928 relu De64rel
Dense 64 10 10 650 softmax De10sof
labelling this model as Adam:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
model #13: optimiser = RMSprop
class input output units params activ label
========================================================================
Conv2D 28 26 320 relu Corel
MaxPooling2D 26 13 0 Ma
Conv2D 13 11 18496 relu Corel
MaxPooling2D 11 5 0 Ma
Conv2D 5 3 36928 relu Corel
Flatten 3 576 0 Fl
Dense 576 64 64 36928 relu De64rel
Dense 64 10 10 650 softmax De10sof
labelling this model as RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
model #14: optimiser = SGD
class input output units params activ label
========================================================================
Conv2D 28 26 320 relu Corel
MaxPooling2D 26 13 0 Ma
Conv2D 13 11 18496 relu Corel
MaxPooling2D 11 5 0 Ma
Conv2D 5 3 36928 relu Corel
Flatten 3 576 0 Fl
Dense 576 64 64 36928 relu De64rel
Dense 64 10 10 650 softmax De10sof
labelling this model as SGD:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
Let us define a function to make 2 subplots for each model fitting: losses vs epoch on the left, accuracy vs epoch on the right. Each subplot will have a line for training and another line for validation.
%matplotlib inline
import matplotlib.pyplot as plt
def plottis(history):
    """Plot loss (left) and accuracy (right) against epoch for one fit.

    Each subplot shows one line for training and one for validation, with a
    legend to tell them apart.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict
            must hold 'loss', 'val_loss' and the accuracy metric with its
            'val_' counterpart.
    """
    logs = history.history
    # Older Keras logs the metric as 'acc'; from Keras 2.3 it is logged under
    # the name passed to compile(), i.e. 'accuracy'. Accept either.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    loss = logs['loss']
    val_loss = logs['val_loss']
    plt.figure(figsize=(10, 3))
    plt.subplot(121)
    plt.plot(range(1, len(loss) + 1), loss, label='training')
    plt.plot(range(1, len(val_loss) + 1), val_loss, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    # Without a legend the label= arguments above are never displayed.
    plt.legend()
    plt.subplot(122)
    plt.plot(range(1, len(acc) + 1), acc, label='training')
    plt.plot(range(1, len(val_acc) + 1), val_acc, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
We fit with callbacks asking Keras to shorten the number of epochs if validation loss stops decreasing.
# Early-stopping callback shared by every fit in fittis(): it monitors
# 'val_loss' with a patience of 5 epochs and no minimum improvement
# (min_delta=0), and prints nothing (verbose=0).
cb = callbacks.EarlyStopping(monitor='val_loss',
min_delta=0,
patience=5,
verbose=0, mode='auto')
def fittis(model, bs, ep, train_X, train_y, test_X):
    """Fit `model` with early stopping, refit, evaluate on test data, report.

    Steps:
      1. Fit for up to `ep` epochs with a 30% validation split and the
         module-level early-stopping callback `cb`, then plot the curves.
      2. Find the (1-based) epochs of best validation accuracy and loss.
      3. Fit again on all of `train_X` for `ilos` epochs (no validation).
      4. Evaluate on `test_X` against the module-level `test_y`.

    Args:
        model: compiled Keras model (trained in place).
        bs: batch size.
        ep: maximum number of epochs for the first fit.
        train_X, train_y: training inputs and labels.
        test_X: test inputs; the labels come from the global `test_y`.

    Returns:
        ``(model, myhistory)`` where ``myhistory`` is
        ``[train_acc, train_los, val_acc, val_los, iacc, ilos, test_acc, tim]``.
    """
    tic = time.perf_counter()
    history = model.fit(train_X, train_y,
                        epochs=ep, batch_size=bs,
                        validation_split=.3, verbose=0,
                        callbacks=[cb], shuffle=False)
    plottis(history)
    logs = history.history
    # Older Keras logs the metric as 'acc'; from Keras 2.3 it is logged under
    # the name passed to compile(), i.e. 'accuracy'. Accept either.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    train_acc = logs[acc_key]
    train_los = logs['loss']
    val_acc = logs['val_' + acc_key]
    val_los = logs['val_loss']
    # 1-based epoch of the first best value. list.index() works whether the
    # logged values are Python floats or NumPy scalars; comparing the list
    # itself to a scalar (as np.where(list == x) does) only works for the
    # latter and otherwise yields an empty result.
    iacc = 1 + list(val_acc).index(max(val_acc))
    ilos = 1 + list(val_los).index(min(val_los))
    # NOTE(review): this second fit continues from the weights reached by the
    # first fit; the model is not re-initialised. Confirm that is intended.
    model.fit(train_X, train_y, epochs=ilos, batch_size=bs, verbose=0, shuffle=False)
    _, test_acc = model.evaluate(test_X, test_y)
    # Elapsed time covers both fits, the plotting and the evaluation.
    tim = time.perf_counter() - tic
    print('train_acc = {:.3f} val_acc = {:.3f} epochs = {:3d} test_acc = {:.3f} time = {:.1e}'.format
          (max(train_acc), max(val_acc), ilos, test_acc, tim))
    myhistory = [train_acc, train_los, val_acc, val_los, iacc, ilos, test_acc, tim]
    return model, myhistory
# Fit every compiled model, feeding conv nets the channel-last images and all
# other nets the flattened images; collect one history record per model.
myhistory = []
for nm, m in enumerate(model):
    print(modellabel[nm])
    s = '{}'.format(m.layers[0])
    # The first layer's repr tells us which input layout the net expects.
    fit_X, eval_X = (train_X2, test_X2) if 'Conv2D' in s else (train_X1, test_X1)
    m, h = fittis(m, 128, 100, fit_X, train_y, eval_X)
    myhistory.append(h)
We get the output:
Adam:De512rel|De10sof
10000/10000 [==============================] - 0s 34us/step
train_acc = 0.999 val_acc = 0.975 epochs = 7 test_acc = 0.977 time = 2.5e+01
RMSprop:De512rel|De10sof
10000/10000 [==============================] - 0s 35us/step
train_acc = 0.998 val_acc = 0.973 epochs = 5 test_acc = 0.978 time = 1.7e+01
SGD:De512rel|De10sof
10000/10000 [==============================] - 0s 35us/step
train_acc = 0.976 val_acc = 0.964 epochs = 100 test_acc = 0.978 time = 2.1e+02
Adam:De10lin|Acsof
10000/10000 [==============================] - 0s 33us/step
train_acc = 0.933 val_acc = 0.924 epochs = 27 test_acc = 0.927 time = 6.3e+01
RMSprop:De10lin|Acsof
10000/10000 [==============================] - 0s 33us/step
train_acc = 0.930 val_acc = 0.924 epochs = 19 test_acc = 0.927 time = 4.3e+01
SGD:De10lin|Acsof
10000/10000 [==============================] - 0s 33us/step
train_acc = 0.917 val_acc = 0.915 epochs = 100 test_acc = 0.922 time = 1.9e+02
Adam:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof
10000/10000 [==============================] - 0s 35us/step
train_acc = 0.995 val_acc = 0.970 epochs = 5 test_acc = 0.975 time = 2.0e+01
RMSprop:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof
10000/10000 [==============================] - 0s 36us/step
train_acc = 0.996 val_acc = 0.971 epochs = 6 test_acc = 0.974 time = 2.0e+01
SGD:De128lin|Acrel|De128lin|Acrel|De10lin|Acsof
10000/10000 [==============================] - 0s 36us/step
train_acc = 0.986 val_acc = 0.968 epochs = 100 test_acc = 0.977 time = 2.2e+02
Adam:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
10000/10000 [==============================] - 0s 40us/step
train_acc = 0.981 val_acc = 0.974 epochs = 12 test_acc = 0.980 time = 4.3e+01
RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
10000/10000 [==============================] - 0s 36us/step
train_acc = 0.979 val_acc = 0.973 epochs = 10 test_acc = 0.981 time = 3.3e+01
SGD:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
10000/10000 [==============================] - 0s 38us/step
train_acc = 0.962 val_acc = 0.968 epochs = 100 test_acc = 0.979 time = 2.4e+02
Adam:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
10000/10000 [==============================] - 1s 52us/step
train_acc = 0.999 val_acc = 0.989 epochs = 14 test_acc = 0.991 time = 6.5e+01
RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
10000/10000 [==============================] - 0s 50us/step
train_acc = 0.996 val_acc = 0.987 epochs = 4 test_acc = 0.990 time = 2.3e+01
SGD:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
10000/10000 [==============================] - 1s 51us/step
train_acc = 0.994 val_acc = 0.982 epochs = 46 test_acc = 0.989 time = 1.6e+02
Judging by the highest test_acc, we find the best-performing architectures to be:
Colin|Acrel|Ma|Colin|Acrel|Ma|Fl|De500lin|Acrel|De10lin|Acsof
Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
De512rel|De10sof
As expected, SGD is slow. Overall (not taking the 4th significant figure too seriously), for the architectures tested here we find RMSprop most efficient.
Can we afford a smaller training set?
So far, we have been training with 60,000 data points. Can we afford any less? We pick the 3 models from above, and run each with reduced number of data points.
Varying train size with RMSprop:De10lin|Acsof
# Refit a fresh RMSprop copy of architecture 1 on the first sz training
# samples, for sz = 10,000, 20,000, ..., 50,000.
for sz in range(10000, 60000, 10000):
    fresh = compiletis(arch(1), 'rmsprop')
    model.append(fresh)
    label = summarisetis(fresh)
    modellabel.append(label)
    print(label)
    m, h = fittis(fresh, 128, 100, train_X1[:sz], train_y[:sz], test_X1)
    myhistory.append(h)
Among the output lines we get the following when sz=40,000:
RMSprop:De10lin|Acsof
10000/10000 [==============================] - 0s 42us/step
train_acc = 0.934 val_acc = 0.916 epochs = 21 test_acc = 0.928 time = 3.7e+01
Reducing training samples to 40,000 got us to test_acc=.928, essentially the same as the test_acc=.927 this model (RMSprop:De10lin|Acsof) scored above with all 60,000 training samples. So, for this model, we can afford just 40,000 training samples.
Varying train size with RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
# Refit a fresh RMSprop copy of architecture 3 on the first sz training
# samples, for sz = 10,000, 20,000, ..., 50,000.
for sz in range(10000, 60000, 10000):
    fresh = compiletis(arch(3), 'rmsprop')
    model.append(fresh)
    label = summarisetis(fresh)
    modellabel.append(label)
    print(label)
    m, h = fittis(fresh, 128, 100, train_X1[:sz], train_y[:sz], test_X1)
    myhistory.append(h)
Among the lines of output we find the following when sz=50,000:
RMSprop:De128lin|Acrel|Dr|De128lin|Acrel|Dr|De10lin|Acsof
10000/10000 [==============================] - 0s 45us/step
train_acc = 0.977 val_acc = 0.967 epochs = 9 test_acc = 0.978 time = 3.0e+01
Reducing training samples to 50,000 got us to test_acc=.978, which is not far from the previous score of test_acc=.981 when all 60,000 samples were used for training.
Varying train size with RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
# Refit a fresh RMSprop copy of architecture 4 (conv net, channel-last input)
# on the first sz training samples, for sz = 10,000, 20,000, ..., 50,000.
for sz in range(10000, 60000, 10000):
    fresh = compiletis(arch(4), 'rmsprop')
    model.append(fresh)
    label = summarisetis(fresh)
    modellabel.append(label)
    print(label)
    m, h = fittis(fresh, 128, 100, train_X2[:sz], train_y[:sz], test_X2)
    myhistory.append(h)
Among the lines of output we find the following when sz=40,000:
RMSprop:Corel|Ma|Corel|Ma|Corel|Fl|De64rel|De10sof
10000/10000 [==============================] - 1s 57us/step
train_acc = 0.996 val_acc = 0.984 epochs = 3 test_acc = 0.990 time = 1.5e+01
Reducing training samples to 40,000 got us to test_acc=.990, which is the same as the previous score of test_acc=.990 when all 60,000 samples were used for training. So we can afford to have just 40,000 training samples.