Classifying MNIST images with a deep CNN using TensorFlow

By Michael Eryan

This is my practice run at classifying the classic MNIST handwritten-digit data using TensorFlow's Layers API.

In [1]:
# In[]: Import libraries
import time
import numpy as np
import scipy.signal
import scipy.misc
import math #for ceil fn
import gzip
import struct

# Check the Python and TensorFlow environment
import sys
print ("\n Python version is:", sys.version_info, "\n")

import tensorflow as tf

from tensorflow.python.client import device_lib
print ("\n Local devices: \n")
print (device_lib.list_local_devices() )

print ("\n GPU device name: \n")
print (tf.test.gpu_device_name() )
 Python version is: sys.version_info(major=3, minor=6, micro=2, releaselevel='final', serial=0) 


 Local devices: 

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6467853967938227808
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1469952819
locality {
  bus_id: 1
}
incarnation: 10513650714819437445
physical_device_desc: "device: 0, name: GeForce GTX 960, pci bus id: 0000:01:00.0, compute capability: 5.2"
]

 GPU device name: 

/device:GPU:0
In [2]:
# In[]:
import os
os.chdir(r"C:\Users\rf\Google Drive\Education\Python\Codes\PML\data") # no last slash

try:
    img = scipy.misc.imread('./example-image.png', mode='RGB')
except AttributeError:
    s = ("scipy.misc.imread requires Python's image library PIL"
         " You can satisfy this requirement by installing the"
         " userfriendly fork PILLOW via `pip install pillow`.")
    raise AttributeError(s)
    
print('Image shape:', img.shape)
print('Number of channels:', img.shape[2])
print('Image data type:', img.dtype)

print(img[100:102, 100:102, :])
Image shape: (252, 221, 3)
Number of channels: 3
Image data type: uint8
[[[179 134 110]
  [182 136 112]]

 [[180 135 111]
  [182 137 113]]]
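Note: scipy.misc.imread was deprecated and later removed from SciPy (1.2+). If it is unavailable, imageio offers a near drop-in replacement; a minimal sketch, assuming imageio is installed (`pip install imageio`):

In [ ]:
# Alternative to scipy.misc.imread for newer SciPy versions (assumes imageio)
import imageio
img = imageio.imread('./example-image.png')
print('Image shape:', img.shape)  # same (height, width, channels) layout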
In [6]:
# In[]:
# # Implementing a deep convolutional neural network using TensorFlow
# ## The multilayer CNN architecture 
# ## Loading and preprocessing the data
## Unzip the gzipped MNIST files into the working directory

zipped_mnist = [f for f in os.listdir('./')
                if f.endswith('ubyte.gz')]
for z in zipped_mnist:
    with gzip.GzipFile(z, mode='rb') as decompressed, open(z[:-3], 'wb') as outfile:
        outfile.write(decompressed.read())

def load_mnist(path, kind='train'):
    """Load MNIST data from `path`"""
    labels_path = os.path.join(path,
                               '%s-labels-idx1-ubyte'
                                % kind)
    images_path = os.path.join(path,
                               '%s-images-idx3-ubyte'
                               % kind)

    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II',
                                 lbpath.read(8))  # big-endian IDX header: magic number, label count
        labels = np.fromfile(lbpath,
                             dtype=np.uint8)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack(">IIII",
                                               imgpath.read(16))
        images = np.fromfile(imgpath,
                             dtype=np.uint8).reshape(len(labels), 784)

    return images, labels

X_data, y_data = load_mnist('./', kind='train')
print('Rows: %d,  Columns: %d' % (X_data.shape[0], X_data.shape[1]))
X_test, y_test = load_mnist('./', kind='t10k')
print('Rows: %d,  Columns: %d' % (X_test.shape[0], X_test.shape[1]))

X_train, y_train = X_data[:50000,:], y_data[:50000]
X_valid, y_valid = X_data[50000:,:], y_data[50000:]

print('Training:   ', X_train.shape, y_train.shape)
print('Validation: ', X_valid.shape, y_valid.shape)
print('Test Set:   ', X_test.shape, y_test.shape)
Rows: 60000,  Columns: 784
Rows: 10000,  Columns: 784
Training:    (50000, 784) (50000,)
Validation:  (10000, 784) (10000,)
Test Set:    (10000, 784) (10000,)
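As a quick sanity check on the flattened 784-column layout, each row can be reshaped back into a 28x28 image; a minimal sketch, assuming matplotlib is installed:

In [ ]:
# Visualize the first training digit (assumes matplotlib is available)
import matplotlib.pyplot as plt
plt.imshow(X_train[0].reshape(28, 28), cmap='gray')
plt.title('Label: %d' % y_train[0])
plt.show()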
In [7]:
# In[]:
#  How large of a batch can I fit into my GPU memory? time to upgrade yet?
def batch_generator(X, y, batch_size=64, shuffle=False, batch_seed=None):
    
    idx = np.arange(y.shape[0])
    
    if shuffle:
        rng = np.random.RandomState(batch_seed) #must be a different number for each epoch
        rng.shuffle(idx) #reshuffles the indexes (row numbers)
        X = X[idx] #returns the data in the new order
        y = y[idx]
    
    for i in range(0, X.shape[0], batch_size):
        yield (X[i:i+batch_size, :], y[i:i+batch_size]) #yield batch_size-sized slices of the data

#batch_gen = batch_generator(X_train_centered, y_train, shuffle=True)
#print (type(batch_gen)) #a generator object: holds no data until iterated from a loop, e.g. with enumerate
#print (enumerate(batch_gen))

mean_vals = np.mean(X_train, axis=0)
std_val = np.std(X_train)

X_train_centered = (X_train - mean_vals)/std_val
X_valid_centered = (X_valid - mean_vals)/std_val
X_test_centered = (X_test - mean_vals)/std_val

#free memory
del X_data, y_data, X_train, X_valid, X_test
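Before training, the generator can be exercised once to confirm the batch shapes; an illustrative sketch using only the variables defined above:

In [ ]:
# Pull a single batch to verify shapes (the training loop below does the real work)
demo_gen = batch_generator(X_train_centered, y_train, batch_size=64,
                           shuffle=True, batch_seed=0)
batch_x, batch_y = next(demo_gen)
print(batch_x.shape, batch_y.shape)  # expected: (64, 784) (64,)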
In [8]:
# In[]: 
# ## Implementing a CNN in the TensorFlow layers API
# Moved some hyperparameter definitions to the Session call
learning_rate=1e-4
dropout_rate=0.5
shuffle=True
graph_seed=1 #for initial weights, so a new session can start at the same values
validation_set = 1 #set to None to skip validation scoring
    
np.random.seed(graph_seed) # for vars/weights
In [9]:
# In[]:
#custom names can be given only to layers
#define the graph
g = tf.Graph()
with g.as_default():
    ## Set the graph-level random seed so that the random sequences generated by all ops are repeatable across sessions
    tf.set_random_seed(graph_seed) #fixes the initial variables/weights, so runs can be compared
    
    ## build the network:
    
    ## Placeholders for X and y: - do not rename
    tf_x = tf.placeholder(tf.float32, 
                          shape=[None, 784],
                          name='tf_x')
    tf_y = tf.placeholder(tf.int32, 
                          shape=[None],
                          name='tf_y')
    is_train = tf.placeholder(tf.bool, 
                          shape=(),
                          name='is_train')
    
    ## reshape x to a 4D tensor: 
    ##  [batchsize, width, height, 1]
    tf_x_image = tf.reshape(tf_x, shape=[-1, 28, 28, 1],
                          name='Input_x_2dimages')
    print('tf_x_image:',tf_x_image.get_shape())
    
    ## One-hot encoding:
    tf_y_onehot = tf.one_hot(indices=tf_y, depth=10,
                          dtype=tf.float32,
                          name='Input_y_onehot')
    
    ## 1st layer: Conv_1 - produces 32 feature maps per image; a 5x5 kernel with padding='VALID' turns 28x28 inputs into 24x24 outputs
    h1 = tf.layers.conv2d(tf_x_image, 
                          kernel_size=(5, 5), 
                          filters=32, 
                          activation=tf.nn.relu,
                          padding='VALID', # 'VALID' (the default) adds no zero padding; 'SAME' would keep 28x28
                          name='Conv2d_h1')
    print('h1:',h1.get_shape())
    
    ## Max pooling halves each feature map to 12x12 (still 32 per image)
    h1_pool = tf.layers.max_pooling2d(h1, 
                          pool_size=(2, 2), 
                          strides=(2, 2))
    print('h1_pool:',h1_pool.get_shape())
    
    ## 2nd layer: Conv_2 - again padding='VALID' (the default), so 12x12 shrinks to 8x8
    ## As seen in TensorBoard, the kernel here is 5x5x32x64: the 32 input maps act like
    ## channels that are summed, so the output is 64 such "summed" conv results
    h2 = tf.layers.conv2d(h1_pool, kernel_size=(5,5), 
                          filters=64, #can be any number; each filter spans all 32 input maps
                          activation=tf.nn.relu,
                          name='Conv2d_h2')
    print('h2:',h2.get_shape())
    
    ## Max pooling halves each feature map to 4x4 (64 per image)
    h2_pool = tf.layers.max_pooling2d(h2, 
                          pool_size=(2, 2), 
                          strides=(2, 2))
    print('h2_pool:',h2_pool.get_shape())
    
    ## 3rd layer: fully connected - the 4x4x64 maps are flattened to 1024 features per observation
    input_shape = h2_pool.get_shape().as_list()
    n_input_units = np.prod(input_shape[1:])
    h2_pool_flat = tf.reshape(h2_pool, 
                          shape=[-1, n_input_units], 
                          name='Flatten')
    print('h2_pool_flat:',h2_pool_flat.get_shape())
    
    ## A dense ReLU layer on the 1024 flattened features: 1024 units, each with 1024 weights (a 1024x1024 weight matrix)
    h3 = tf.layers.dense(h2_pool_flat, 1024, 
                          activation=tf.nn.relu, 
                          name='Activations')
    print('h3:',h3.get_shape())
    
    ## Dropout randomly zeroes a fraction (rate) of the 1024 activations during training, regularizing the network to prevent overfitting
    h3_drop = tf.layers.dropout(h3, 
                          rate=dropout_rate,
                          training=is_train)
    print('h3_drop:',h3_drop.get_shape())
    
    ## 4th layer: fully connected (linear activation) - maps the 1024 features to 10 class logits per observation
    h4 = tf.layers.dense(h3_drop, 10, 
                          activation=None, 
                          name="Output_layer") #shows as two dense_ cells TB?
    print('h4:',h4.get_shape())

    ## Prediction
    predictions = {
        'probabilities': tf.nn.softmax(h4, 
                          name='probabilities'),
        'labels': tf.cast(tf.argmax(h4, axis=1), 
                          tf.int32, name='labels')}
    
    ## Loss Function and Optimization
    cross_entropy_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=h4, labels=tf_y_onehot),
                name='cross_entropy_loss') #do not rename
    
    ## Optimizer:
    optimizer = tf.train.AdamOptimizer(learning_rate)
    optimizer = optimizer.minimize(cross_entropy_loss, name='train_op')
    
    ## Finding accuracy
    correct_predictions = tf.equal(
        predictions['labels'], 
        tf_y, name='correct_preds')
    
    accuracy = tf.reduce_mean(
        tf.cast(correct_predictions, tf.float32),
        name='accuracy')

    ## define the initializer and saver after the network

    ## initializer
    init_op = tf.global_variables_initializer()

    ## saver 
    saver = tf.train.Saver()
tf_x_image: (?, 28, 28, 1)
h1: (?, 24, 24, 32)
h1_pool: (?, 12, 12, 32)
h2: (?, 8, 8, 64)
h2_pool: (?, 4, 4, 64)
h2_pool_flat: (?, 1024)
h3: (?, 1024)
h3_drop: (?, 1024)
h4: (?, 10)
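The printed shapes can be verified by hand: with padding='VALID' a convolution shrinks each spatial dimension by (kernel size - 1), and a 2x2 max pool with stride 2 halves it. A small arithmetic sketch:

In [ ]:
# Output-size arithmetic for the layers above
def valid_conv_out(size, kernel=5):
    return size - kernel + 1            # VALID conv: 28 -> 24, 12 -> 8

def pool_out(size, pool=2, stride=2):
    return (size - pool) // stride + 1  # 2x2 max pool, stride 2: 24 -> 12, 8 -> 4

s = 28
for layer in ('conv1', 'pool1', 'conv2', 'pool2'):
    s = valid_conv_out(s) if layer.startswith('conv') else pool_out(s)
    print(layer, s)
# conv1 24, pool1 12, conv2 8, pool2 4 -> 4*4*64 = 1024 flattened units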
In [10]:
# In[]: Start training
layer_start = time.time()

# Define hyperparameters
epochs=20
batchsize=3500 #pretty large

with tf.Session(graph=g) as sess:    
    sess.run(init_op)

    train_cost_ = []
    X_data = X_train_centered
    y_data = y_train
    X_valid = X_valid_centered
    y_valid = y_valid
    
    # main loop to go through batches
    for epoch in range(1, epochs + 1):
        batch_gen = batch_generator(X_data, y_data, batch_size=batchsize, shuffle=shuffle, batch_seed=None) 
            #each epoch needs a new random seed, so each run is different
            #with a fixed seed (e.g. 1) the per-epoch performance is identical across runs, for two reasons:
            #tf.set_random_seed fixes the starting weights, and a fixed batch seed creates the same batches every time
        avg_loss = 0.0
        
        for i, (batch_x,batch_y) in enumerate(batch_gen):
            feed = {'tf_x:0': batch_x, 
                    'tf_y:0': batch_y,
                    'is_train:0': True} ## for dropout
            loss, _ = sess.run(['cross_entropy_loss:0', 'train_op'], 
                    feed_dict=feed) #this is the part that actually feeds the data into the cnn
            avg_loss += loss
            
        print('Epoch %02d: Training Avg. Loss: ''%7.3f' % (epoch, avg_loss), end=' ')

        #accuracy must also be computed in batches and then averaged; an unweighted average is slightly biased when the last batch is smaller, but fine here (a weighted variant appears after the output below)
        if validation_set is not None:
            valid_accuracy_sum =0
            batch_num = math.ceil(len(y_valid) / batchsize) # to avg the accuracies, need the ceiling fn
            
            batch_gen_v =  batch_generator(X_valid, y_valid, batch_size=batchsize, shuffle=False) 
            for i, (batch_x,batch_y) in enumerate(batch_gen_v):
                feed = {'tf_x:0': batch_x, 
                        'tf_y:0': batch_y,
                        'is_train:0': False} ## for dropout
                valid_acc = sess.run('accuracy:0', feed_dict=feed)
                valid_accuracy_sum += valid_acc
                valid_accuracy = valid_accuracy_sum / batch_num # verbose but more intuitive
            print('Validation Acc: %7.3f' % valid_accuracy)
        else:
            print()        

    pred_probs =[]
    pred_labels =[]
    batch_gen =  batch_generator(X_test_centered, y_test, batch_size=batchsize, shuffle=False) 
    for i, (batch_x,batch_y) in enumerate(batch_gen): #do not need y
        feed = {'tf_x:0': batch_x, 'is_train:0': False}         
        probs = sess.run('probabilities:0', feed_dict=feed)
        labels = sess.run('labels:0', feed_dict=feed) 
        pred_probs.extend(probs) #extend (not append) flattens the per-batch arrays into one flat list
        pred_labels.extend(labels)

#need to understand TF sessions better - could not restore the session properly to score the test data set (see the restore sketch at the end)
        
layer_end = time.time()

layer_time = round(layer_end - layer_start) 
print ( "\n Layers complete time after",epochs,"epochs:", layer_time, " secs or", round(layer_time / 60.) , "mins \n") 

#10 epochs at max batchsize=3500, 17 secs - 3 secs per epoch!
#so, increasing the batch size reduces the training time for each epoch

print('Test data set Accuracy: %.2f%%' % ( 100* np.sum(y_test == pred_labels)/len(y_test)))
Epoch 01: Training Avg. Loss:  30.366 Validation Acc:   0.804
Epoch 02: Training Avg. Loss:  19.095 Validation Acc:   0.868
Epoch 03: Training Avg. Loss:  10.458 Validation Acc:   0.899
Epoch 04: Training Avg. Loss:   6.913 Validation Acc:   0.921
Epoch 05: Training Avg. Loss:   5.338 Validation Acc:   0.933
Epoch 06: Training Avg. Loss:   4.501 Validation Acc:   0.943
Epoch 07: Training Avg. Loss:   3.895 Validation Acc:   0.951
Epoch 08: Training Avg. Loss:   3.424 Validation Acc:   0.957
Epoch 09: Training Avg. Loss:   3.074 Validation Acc:   0.961
Epoch 10: Training Avg. Loss:   2.769 Validation Acc:   0.965
Epoch 11: Training Avg. Loss:   2.521 Validation Acc:   0.968
Epoch 12: Training Avg. Loss:   2.308 Validation Acc:   0.970
Epoch 13: Training Avg. Loss:   2.139 Validation Acc:   0.972
Epoch 14: Training Avg. Loss:   1.979 Validation Acc:   0.974
Epoch 15: Training Avg. Loss:   1.834 Validation Acc:   0.975
Epoch 16: Training Avg. Loss:   1.737 Validation Acc:   0.976
Epoch 17: Training Avg. Loss:   1.626 Validation Acc:   0.976
Epoch 18: Training Avg. Loss:   1.537 Validation Acc:   0.977
Epoch 19: Training Avg. Loss:   1.458 Validation Acc:   0.979
Epoch 20: Training Avg. Loss:   1.377 Validation Acc:   0.979

 Layers complete time after 20 epochs: 59  secs or 1 mins 

Test data set Accuracy: 98.06%
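As noted in the comments above, the validation accuracy is an unweighted mean of per-batch accuracies, which is slightly biased because 10,000 is not a multiple of 3,500. A weighted variant; this is a sketch meant to replace the validation block inside the `with tf.Session(graph=g) as sess:` scope, not a standalone cell:

# Weight each batch's accuracy by its size, then divide by the total count
weighted_correct = 0.0
for batch_x, batch_y in batch_generator(X_valid, y_valid,
                                        batch_size=batchsize, shuffle=False):
    acc = sess.run('accuracy:0', feed_dict={'tf_x:0': batch_x,
                                            'tf_y:0': batch_y,
                                            'is_train:0': False})
    weighted_correct += acc * len(batch_y)  # the last batch has only 3000 rows
valid_accuracy = weighted_correct / len(y_valid)
print('Validation Acc: %7.3f' % valid_accuracy)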
In [11]:
# In[]: Predictions on the test data set
print ("\nActual labels:")        
print (y_test[:10]) #first 10 actual classes
print ("\nPredicted labels:") 
print (pred_labels[:10]) #first 10 predicted classes
print ("\nPrediction probabilities:") 
print (np.max(pred_probs[:10],axis=0)) #per-class max probability across the first 10 observations
#OK, makes sense
Actual labels:
[7 2 1 0 4 1 4 9 5 9]

Predicted labels:
[7, 2, 1, 0, 4, 1, 4, 9, 5, 9]

Prediction probabilities:
[ 0.99972039  0.99675888  0.99558532  0.00526556  0.99759346  0.9485265
  0.04422363  0.99986506  0.0064677   0.99353302]
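Since np.max(..., axis=0) reduces across observations, the low entries above belong to classes that do not appear among the first ten test digits. To see the confidence of each individual prediction instead, reduce along axis=1:

In [ ]:
# One max probability per observation: the confidence of each predicted label
print(np.max(pred_probs[:10], axis=1))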
In [ ]:
### Miscellaneous notes below
# In[]: Understanding tensor shapes
#with tf.Session(graph=g) as sess:
#    print('input_x_2dimages:',tf_x_image.get_shape())
#shows the static graph shape, not the runtime shape

# In[]: Save
#with tf.Session(graph=g) as sess:
#    sess.run(init_op) #need to initialize the variables in the session first before can save them
#    saver.save(sess, save_path='./model_one.ckpt', global_step=epochs) #number the checkpoint by global_step - just a name

#model_one.ckpt-1.index saved to current dir, suffix is the epoch

# In[]: Load session
#with tf.Session(graph=g) as sess:
#    saver.restore(sess, save_path='./model_one.ckpt-%d' % epoch) # find that checkpoint
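# A minimal restore-and-score sketch, assuming the checkpoint above was saved
# with global_step=epochs; because the ops were given stable names ('tf_x',
# 'is_train', 'labels'), the restored graph can be fed by name:
#with tf.Session(graph=g) as sess:
#    saver.restore(sess, save_path='./model_one.ckpt-%d' % epochs)
#    feed = {'tf_x:0': X_test_centered[:100], 'is_train:0': False}
#    restored_labels = sess.run('labels:0', feed_dict=feed)
#    print('Restored accuracy on 100 test images: %.2f%%'
#          % (100 * np.mean(restored_labels == y_test[:100])))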

# In[]: The End

The End