Crafting Neural Networks by hand
With some effort, you can do beautiful things. These are all the libraries we need to construct a neural network.
import numpy as np
# Auxiliaries
## Visualisation
import pandas as pd
from tqdm import trange
import seaborn as sns
import matplotlib.pyplot as plt
## Manage Files
import os
Hyperparameters
ds_volume = 1.00 # percentage of data used as dataset
threshold = 0.50 # percentage of dataset used in training (vs. test)
dimensions = [400, 14, 10] # size of each layer
alpha = 2.0 # learning rate
lambda_t = 1.2e-2 # L2 regularization parameter
steps = 200 # number of iterations until stop
def to_multiclass(y, shape):
    ''' Convert an array of labels, e.g. [1, 2, 9, 10, 6], into a table
        in which each row is an item from the list and each column
        is a digit. The table has value 1 in the column corresponding
        to the label (label l goes to column l-1, so label 10, which
        stands for the digit 0, goes to the last column) and 0 elsewhere.
    '''
    Y = np.zeros(shape)
    for i in range(y.shape[0]):
        Y[i, y[i]-1] = 1
    return Y
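As a quick sanity check (assuming labels in the 1-10 range, with 10 standing for the digit 0), the helper should produce one row per label with a single 1 in it:
labels = np.array([3, 10, 1])
print(to_multiclass(labels, (3, 10)))
# Row 0 has its 1 in column 2, row 1 in column 9, row 2 in column 0.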
# Load data from .csv
all_x = np.genfromtxt('dataset/numbers.csv', delimiter=',')
all_y = np.genfromtxt('dataset/labels.csv', delimiter=',', dtype=int)
# Normalize each example to the 0-1 range
all_x = all_x.T
all_x = (all_x-all_x.min(0)) / all_x.ptp(0)
all_x = all_x.T
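After this step every example should span the full 0-1 range; a quick illustrative check (not part of the original pipeline, and assuming no example is a completely constant image):
assert np.allclose(all_x.min(axis=1), 0)
assert np.allclose(all_x.max(axis=1), 1)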
size = np.ceil(all_x.shape[0] * ds_volume).astype(int)
indexes = np.random.choice(all_x.shape[0], size=size, replace=False)
thres = np.ceil(size * threshold).astype(int)
# Separate Training Set From Validation Set
ts = indexes[:thres]
vd = indexes[thres:]
# Dataset creation
dataset_i = all_x[ts]
dataset_o = all_y[ts]
dataset_o = to_multiclass(dataset_o, (dataset_o.shape[0], 10))
Examples visualisation
n_examples = 10
examples = np.random.choice(ts, n_examples)
plt.subplots(1, n_examples, figsize=(20, 4))
for i, example in enumerate(examples):
    plt.subplot(1, n_examples, i+1)
    plt.imshow(all_x[example].reshape(20, 20).T, cmap="gray")
    plt.title("Classified as {}".format(all_y[example] % 10))  # label 10 stands for digit 0
    plt.axis('off')

Initialization
theta = [None]*(len(dimensions)-1)  # list of weight matrices, one per layer transition
for l in range(len(theta)):  # initialisation of weights: N(0, 1); row 0 holds the bias weights
    theta[l] = np.random.normal(size=(dimensions[l]+1, dimensions[l+1]))
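As a sanity check, for dimensions = [400, 14, 10] this should give two matrices of shape (401, 14) and (15, 10), the extra row in each being the bias:
print([t.shape for t in theta])
# [(401, 14), (15, 10)]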
Functions for training
def front_propagation(X, theta):
    # Prepend the bias column of ones to the input.
    s = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
    a = [s]
    for theta_l in theta:
        s = sigmoid(s.dot(theta_l))
        s = np.concatenate((np.ones((s.shape[0], 1)), s), axis=1)
        a.append(s)
    # The output layer has no bias unit, so drop the column of ones we just added.
    a[-1] = np.delete(a[-1], 0, 1)
    return a
def sigmoid(z):
    return 1 / (1 + np.exp(-z))  # element-wise logistic function
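The backpropagation step below relies on the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)); a quick finite-difference check of that property (illustrative only):
z = np.linspace(-5, 5, 11)
eps = 1e-6
numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
analytic = sigmoid(z) * (1 - sigmoid(z))
print(np.max(np.abs(numeric - analytic)))  # close to zero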
def cost(a, y, theta, lambda_t):
    m = y.shape[0]
    # Map y from {0, 1} to {-1, +1}: the hypothesis keeps its sign when y=1
    # and is negated when y=0, so the piecewise log below reproduces the
    # cross-entropy terms log(h) and log(1-h).
    y = (y - 0.5) * 2
    y = y * a[-1]
    dcost = np.vectorize(lambda h: np.log(h) if h > 0 else np.log(1 + h))
    y = dcost(y)
    cost = -np.sum(y)
    # L2 regularization over all weights.
    for theta_l in theta:
        cost += lambda_t / 2 * np.sum(theta_l ** 2)
    return cost/m
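To see why the sign trick works: for a single prediction h and label y it reduces to the usual cross-entropy term. A small illustrative check (not part of the training code):
h, y_lab = 0.8, 1
standard = -(y_lab * np.log(h) + (1 - y_lab) * np.log(1 - h))
signed = (y_lab - 0.5) * 2 * h           # +h when y=1, -h when y=0
trick = -(np.log(signed) if signed > 0 else np.log(1 + signed))
print(standard, trick)                   # both ~0.223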
def grad(a, y, theta, lambda_t):
    m = y.shape[0]
    d = [None]*(len(theta))
    # Output-layer error.
    d[-1] = a[-1] - y
    # Propagate the error backwards through the hidden layers.
    for l in range(len(theta)-1, 0, -1):
        s = d[l]
        if l != len(theta)-1:
            s = np.delete(d[l], 0, 1)  # drop the bias column of the next layer's error
        d[l-1] = s.dot(theta[l].T) * a[l] * (1 - a[l])
    D = [None]*len(theta)
    for l in range(len(d)-1, -1, -1):
        s = d[l]
        if l != len(d)-1:
            s = np.delete(s, 0, 1)
        D[l] = a[l].T.dot(s)/m
        # Add the L2 regularization term, excluding the bias row.
        zeros = np.zeros((1, theta[l].shape[1]))
        thets = np.delete(theta[l], 0, 0)
        D[l] += lambda_t * np.concatenate((zeros, thets), axis=0)
    return D
def numerical_grad(a, y, theta, lambda_t):
    ''' Estimate each partial derivative of the cost by central finite
        differences, perturbing one weight at a time by +/- e.
    '''
    e = 1e-4
    it = 0
    for l, theta_l in enumerate(theta):
        for i, row in enumerate(theta_l):
            for j, col in enumerate(row):
                theta[l][i][j] += e
                a[0] = np.delete(a[0], 0, 1)  # strip the bias column before re-running the forward pass
                a = front_propagation(a[0], theta)
                cost_p = cost(a, y, theta, lambda_t)
                theta[l][i][j] -= 2 * e
                a[0] = np.delete(a[0], 0, 1)
                a = front_propagation(a[0], theta)
                cost_m = cost(a, y, theta, lambda_t)
                theta[l][i][j] += e
                grad = (cost_p - cost_m) / 2 / e
                yield (it, l, i, j, grad)
                it += 1
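The generator is consumed in the debug block of the training loop below; as a standalone sketch, the analytic and numerical gradients can be compared side by side on a small slice of the training data (the slice size of 5 is arbitrary):
a_small = front_propagation(dataset_i[:5], theta)
analytic = grad(a_small, dataset_o[:5], theta, lambda_t)
for it, l, i, j, num in numerical_grad(a_small, dataset_o[:5], theta, lambda_t):
    print(l, i, j, analytic[l][i][j], num)
    if it > 10:
        break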
def shape(v, text):
    # Debug helper: print the shape of every array in the list v.
    print("{}: ".format(text), end=' ')
    for i in v:
        print(i.shape, end=' ')
    print()
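For example, calling it on the activations of a forward pass over the training set prints one shape per layer (m being the number of training examples):
a_dbg = front_propagation(dataset_i, theta)
shape(a_dbg, "activations")  # activations:  (m, 401) (m, 15) (m, 10)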
Training
# training
costs = pd.Series(dtype=float)
for i in trange(steps):
    a = front_propagation(dataset_i, theta)
    costs[i] = cost(a, dataset_o, theta, lambda_t)
    g1 = grad(a, dataset_o, theta, lambda_t)
    # Gradient descent step.
    for l in range(len(theta)):
        theta[l] -= g1[l] * alpha
    if False:  # DEBUG: compare analytic and numerical gradients
        for it, l, i, j, num_grad in numerical_grad(a, dataset_o, theta, lambda_t):
            print("#[{:02d}][{:02d}][{:02d}] {:11.8f} =>{:11.8f} ({:}%)".format(l, i, j,
                  g1[l][i][j], num_grad, (g1[l][i][j] - num_grad)/num_grad*100))
            if it > 20:
                break
100%|██████████| 200/200 [01:16<00:00, 2.63it/s]
plt.figure(figsize=(20, 5))
plt.plot(costs)
plt.title('Model Progress')
plt.ylabel('Loss (cross-entropy)')
plt.xlabel('# Epochs')
plt.show();

Validation
hyperp_filename = 'hyperparameters.csv'
if os.path.exists(hyperp_filename):
    hyperp = pd.read_csv(hyperp_filename)
else:
    hyperp = pd.DataFrame(index=[], columns=["ds_volume", "threshold", "layers", "nodes_per_layer",
                                             "alpha", "lambda_t", "steps", "accuracy", "cost_ts", "cost_vd"])
test_i = all_x[vd]
test_o = all_y[vd]
test_o = to_multiclass(test_o, (test_o.shape[0], theta[-1].shape[1]))
guesses = np.zeros((test_o.shape[1]+1, test_o.shape[1]+1))
a_test = front_propagation(test_i, theta)
hypothesis = a_test[-1]
for i in range(hypothesis.shape[0]):
    guesses[np.argmax(test_o[i])+1, np.argmax(hypothesis[i])+1] += 1
accuracy = guesses.trace()/np.sum(guesses)*100
print("Accuracy: {:2.2f}%".format(accuracy))
cost_ts = costs.iloc[-1]  # final training cost from the loop above
print("Cost dataset : {:.2f}".format(cost_ts))
cost_vd = cost(a_test, test_o, theta, lambda_t)  # cost() expects the full list of activations
print("Cost validation: {:.2f}".format(cost_vd))
display(pd.DataFrame(guesses))
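Equivalently, the accuracy can be read off directly from the argmax of each prediction, without building the confusion matrix first (illustrative check):
direct_acc = np.mean(np.argmax(hypothesis, axis=1) == np.argmax(test_o, axis=1)) * 100
print("Accuracy (direct): {:2.2f}%".format(direct_acc))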
# saving
hyperp = pd.concat([hyperp, pd.DataFrame([{
    'ds_volume': ds_volume,
    'threshold': threshold,
    'layers': len(dimensions),
    'nodes_per_layer': dimensions[1],
    'alpha': alpha,
    'lambda_t': lambda_t,
    'steps': steps,
    'accuracy': accuracy,
    'cost_ts': cost_ts,
    'cost_vd': cost_vd
}])], ignore_index=True)
hyperp.to_csv(hyperp_filename, index=None)
display(hyperp.sort_values(by='cost_vd', ascending=True))
Accuracy: 81.96%
Cost dataset : 1.52
Cost validation: 4.48
| true \ predicted | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 243.0 | 0.0 | 2.0 | 0.0 | 1.0 | 1.0 | 2.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 8.0 | 230.0 | 3.0 | 5.0 | 0.0 | 4.0 | 5.0 | 1.0 | 0.0 | 6.0 |
| 3 | 0.0 | 5.0 | 14.0 | 226.0 | 0.0 | 5.0 | 1.0 | 5.0 | 0.0 | 0.0 | 3.0 |
| 4 | 0.0 | 5.0 | 6.0 | 0.0 | 220.0 | 2.0 | 8.0 | 3.0 | 0.0 | 5.0 | 2.0 |
| 5 | 0.0 | 6.0 | 2.0 | 33.0 | 3.0 | 178.0 | 9.0 | 0.0 | 0.0 | 0.0 | 25.0 |
| 6 | 0.0 | 7.0 | 3.0 | 0.0 | 0.0 | 4.0 | 223.0 | 0.0 | 0.0 | 0.0 | 4.0 |
| 7 | 0.0 | 17.0 | 3.0 | 0.0 | 2.0 | 1.0 | 0.0 | 222.0 | 0.0 | 0.0 | 3.0 |
| 8 | 0.0 | 21.0 | 12.0 | 42.0 | 0.0 | 11.0 | 3.0 | 1.0 | 146.0 | 6.0 | 4.0 |
| 9 | 0.0 | 5.0 | 0.0 | 3.0 | 17.0 | 4.0 | 2.0 | 83.0 | 1.0 | 116.0 | 7.0 |
| 10 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 1.0 | 0.0 | 0.0 | 245.0 |
| # | ds_volume | threshold | layers | nodes_per_layer | alpha | lambda_t | steps | accuracy | cost_ts | cost_vd |
|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 1.0 | 0.5 | 3.0 | 14.0 | 1.0 | 0.100 | 200.0 | 9.44 | 1.516239 | 3.255590 |
| 2 | 1.0 | 0.5 | 3.0 | 14.0 | 0.1 | 0.010 | 200.0 | 37.28 | 1.516239 | 3.437609 |
| 3 | 1.0 | 0.5 | 3.0 | 14.0 | 1.0 | 0.010 | 200.0 | 83.92 | 1.516239 | 3.814707 |
| 0 | 1.0 | 0.5 | 3.0 | 14.0 | 2.0 | 0.010 | 200.0 | 84.08 | 1.516239 | 4.160384 |
| 6 | 1.0 | 0.5 | 3.0 | 14.0 | 2.0 | 0.012 | 200.0 | 81.96 | 1.516239 | 4.478654 |
| 1 | 1.0 | 0.5 | 4.0 | 120.0 | 2.0 | 0.010 | 200.0 | 67.48 | 1.516239 | 4.785977 |
| 4 | 1.0 | 0.5 | 3.0 | 14.0 | 1.0 | 0.001 | 200.0 | 78.40 | 1.516239 | 6.431485 |
plt.figure(figsize=(16, 8))
sns.heatmap(guesses[1:, 1:])
plt.title('Confusion Matrix')
plt.show();

hyperp.sort_values(by='accuracy', ascending=False).head()
| # | ds_volume | threshold | layers | nodes_per_layer | alpha | lambda_t | steps | accuracy | cost_ts | cost_vd |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.5 | 3.0 | 14.0 | 2.0 | 0.010 | 200.0 | 84.08 | 1.516239 | 4.160384 |
| 3 | 1.0 | 0.5 | 3.0 | 14.0 | 1.0 | 0.010 | 200.0 | 83.92 | 1.516239 | 3.814707 |
| 6 | 1.0 | 0.5 | 3.0 | 14.0 | 2.0 | 0.012 | 200.0 | 81.96 | 1.516239 | 4.478654 |
| 4 | 1.0 | 0.5 | 3.0 | 14.0 | 1.0 | 0.001 | 200.0 | 78.40 | 1.516239 | 6.431485 |
| 1 | 1.0 | 0.5 | 4.0 | 120.0 | 2.0 | 0.010 | 200.0 | 67.48 | 1.516239 | 4.785977 |
plt.figure(figsize=(20, 5))
plt.scatter(hyperp['cost_vd'], hyperp['accuracy'])
plt.title('Hyperparameters results')
plt.xlabel('Loss in validation set')
plt.ylabel('Accuracy')
plt.show();

Experiment #1 - What regions does the neural network look at?
If we fix a desired output vector and run the neural network backwards (multiplying by the transposed weight matrices), we produce an image. This image shows the regions towards which the neural network “directs” its weights when deciding on that output, i.e. the most important regions for each digit.
# Set a digit value here
digit = 3
out_tensor = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
out_tensor[digit - 1] = 1  # labels 1-10 map to columns 0-9, so digit 0 (label 10) lands in the last column
k = []
# Run the network backwards: output -> hidden -> input, dropping the bias weight at each step.
k.append(np.dot(out_tensor, theta[1].T))
k[0] = k[0][1:]
k.append(np.dot(k[0], theta[0].T))
k[1] = k[1][1:]
plt.subplots(1, 2, figsize=(20, 5))
plt.subplot(1, 2, 1)
sns.heatmap(k[1].reshape(20, 20).T, center=0)
plt.title("Important regions for digit '{}'".format(digit))
plt.axis('off')
plt.subplot(1, 2, 2)
sns.heatmap(k[1].reshape(20, 20).T, cmap='gray')
plt.title("Important regions for digit '{}'".format(digit))
plt.axis('off')
plt.show();
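The same backwards pass can be repeated for every digit to compare the regions side by side; a minimal sketch of that idea (assuming the same [400, 14, 10] architecture configured above):
plt.subplots(2, 5, figsize=(20, 8))
for d in range(10):
    out = np.zeros(10)
    out[d - 1] = 1                          # digit 0 lives in the last column (label 10)
    hidden = np.dot(out, theta[1].T)[1:]    # back through the output weights, dropping the bias
    pixels = np.dot(hidden, theta[0].T)[1:] # back through the hidden weights, dropping the bias
    plt.subplot(2, 5, d + 1)
    sns.heatmap(pixels.reshape(20, 20).T, center=0, cbar=False)
    plt.title("Digit '{}'".format(d))
    plt.axis('off')
plt.show()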
