Beyond Similarity

Crafting Neural Networks by hand

With some effort, you can do beautiful things

These are all the libraries we need to construct a neural network.

import numpy as np
# Auxiliaries
## Visualisation
import pandas as pd
from tqdm import trange
import seaborn as sns
import matplotlib.pyplot as plt

## File management
import os

Hyperparameters

ds_volume = 1.00                        # fraction of the data used as the dataset
threshold = 0.50                        # fraction of the dataset used for training (vs. validation)

dimensions = [400, 14, 10]              # layer sizes: 400 inputs (20x20 pixels), 14 hidden units, 10 outputs
alpha = 2.0                             # learning rate
lambda_t = 1.2e-2                       # L2 regularisation parameter

steps = 200                             # number of gradient-descent iterations before stopping

def to_multiclass(y, shape):
    ''' Convert an array of labels, e.g. [1, 2, 9, 10, 6], into a table
        in which each row corresponds to an item of the array and each
        column to a class (label v maps to column v - 1; in this dataset
        the label 10 stands for the digit 0). Each row holds a 1 in the
        column of its label and 0 elsewhere.
    '''
    Y = np.zeros(shape)
    for i in range(y.shape[0]):
        Y[i, y[i] - 1] = 1
    return Y
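A quick look at what this encoding produces, using the 1-to-10 label convention of the dataset (a sanity check, not part of the pipeline):

labels = np.array([1, 10, 3])
print(to_multiclass(labels, (3, 10)))
# [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]    label  1 -> column 0
#  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]    label 10 -> column 9 (digit 0)
#  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]   label  3 -> column 2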
# Load data from .csv
all_x = np.genfromtxt('dataset/numbers.csv', delimiter=',')
all_y = np.genfromtxt('dataset/labels.csv', delimiter=',', dtype=int)   # np.int was removed from NumPy

# Min-max normalisation of each image to the range 0-1
all_x = (all_x - all_x.min(1, keepdims=True)) / np.ptp(all_x, axis=1, keepdims=True)

size = int(np.ceil(all_x.shape[0] * ds_volume))
indexes = np.random.choice(all_x.shape[0], size=size, replace=False)
thres = int(np.ceil(size * threshold))

# Separate Training Set From Validation Set
ts = indexes[:thres]
vd = indexes[thres:]

# Dataset creation
dataset_i = all_x[ts]
dataset_o = all_y[ts]
dataset_o = to_multiclass(dataset_o, (dataset_o.shape[0], 10))

Example visualisation

n_examples = 10
examples = np.random.choice(ts, n_examples)

plt.subplots(1, n_examples, figsize=(20, 4))
for i, example in enumerate(examples):
    plt.subplot(1, n_examples, i + 1)
    plt.imshow(all_x[example].reshape(20, 20).T, cmap="gray")
    plt.title("Labelled as {}".format(all_y[example] % 10))   # % 10 maps label 10 back to digit 0
    plt.axis('off')

[Figure: ten random training examples shown as 20x20 grayscale images with their labels]

Initialization

# One weight matrix per layer transition, each with an extra
# row for the bias unit; weights are drawn from N(0, 1)
theta = [np.random.normal(size=(dimensions[l] + 1, dimensions[l + 1]))
         for l in range(len(dimensions) - 1)]
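Sampling from N(0, 1) works for this small network, but with sigmoid activations large initial weights can saturate the units and slow learning down. A common refinement, not used in this notebook, is Xavier/Glorot scaling, which shrinks the standard deviation with the fan-in. A minimal sketch of that alternative:

# Hypothetical alternative: Xavier/Glorot initialisation, scaling each
# matrix by 1/sqrt(fan_in) so sigmoid inputs start near the linear region
theta = [np.random.normal(scale=1 / np.sqrt(dimensions[l] + 1),
                          size=(dimensions[l] + 1, dimensions[l + 1]))
         for l in range(len(dimensions) - 1)]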

Functions for training

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def front_propagation(X, theta):
    ''' Forward pass: return the activations of every layer,
        each with a bias column of ones prepended. '''
    s = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)   # add the bias column
    a = [s]
    for theta_l in theta:
        s = sigmoid(s.dot(theta_l))
        s = np.concatenate((np.ones((s.shape[0], 1)), s), axis=1)
        a.append(s)
    a[-1] = np.delete(a[-1], 0, 1)                              # the output layer has no bias unit
    return a
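One caveat about the sigmoid: for large negative z, np.exp(-z) overflows and NumPy emits a runtime warning, even though the result correctly saturates to 0. If SciPy is available, a numerically robust drop-in replacement is its expit function (a sketch, assuming SciPy as an extra dependency):

from scipy.special import expit

def sigmoid(z):
    # expit computes 1 / (1 + exp(-z)) without overflow warnings
    return expit(z)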
def cost(a, y, theta, lambda_t):
    ''' Regularised cross-entropy cost. The (y - 0.5) * 2 trick maps the
        labels {0, 1} to {-1, +1}, so y * h is h where the label is 1 and
        -h where it is 0; dcost then yields log(h) or log(1 - h) accordingly.
    '''
    m = y.shape[0]
    y = (y - 0.5) * 2
    y = y * a[-1]
    dcost = np.vectorize(lambda h: np.log(h) if h > 0 else np.log(1 + h))
    y = dcost(y)
    cost = -np.sum(y)
    for theta_l in theta:                    # L2 penalty (bias rows included here)
        cost += lambda_t / 2 * np.sum(theta_l ** 2)
    return cost / m
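To convince yourself that the trick above really computes the usual cross-entropy, -sum[y log h + (1 - y) log(1 - h)], you can compare both forms on random values (a quick check, not part of the pipeline):

h = np.random.uniform(0.01, 0.99, size=(5, 10))    # fake network outputs
y = (np.random.rand(5, 10) > 0.5).astype(float)    # fake binary labels
standard = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
signed = ((y - 0.5) * 2) * h
trick = -np.sum(np.log(np.where(signed > 0, signed, 1 + signed)))
assert np.isclose(standard, trick)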
def grad(a, y, theta, lambda_t):
    ''' Backpropagation: gradient of the cost w.r.t. each weight matrix. '''
    m = y.shape[0]
    d = [[]]*(len(theta))                                 # per-layer error terms
    d[-1] = a[-1] - y                                     # output-layer error
    for l in range(len(theta)-1, 0, -1):                  # propagate errors backwards
        s = d[l]
        if l != len(theta)-1:
            s = np.delete(d[l], 0, 1)                     # drop the bias column
        d[l-1] = s.dot(theta[l].T) * a[l] * (1 - a[l])    # sigmoid'(z) = a * (1 - a)
    D = [[]]*len(theta)                                   # per-layer gradients
    D[-1] = np.sum(d[-1], axis=0)/m
    for l in range(len(d)-1, -1, -1):
        s = d[l]
        if l != len(d)-1:
            s = np.delete(s, 0, 1)
        D[l] = a[l].T.dot(s)/m
        zeros = np.zeros((1, theta[l].shape[1]))          # the bias row is not regularised
        thets = np.delete(theta[l], 0, 0)
        D[l] += lambda_t * np.concatenate((zeros, thets), axis=0)
    return D
def numerical_grad(a, y, theta, lambda_t):
    ''' Generator of centred finite-difference gradient estimates,
        one weight at a time; used to sanity-check grad(). '''
    e = 1e-4
    it = 0
    for l, theta_l in enumerate(theta):
        for i, row in enumerate(theta_l):
            for j, col in enumerate(row):
                theta[l][i][j] += e                       # nudge one weight up
                a[0] = np.delete(a[0], 0, 1)              # strip the bias column before re-running
                a = front_propagation(a[0], theta)
                cost_p = cost(a, y, theta, lambda_t)
                theta[l][i][j] -= 2 * e                   # nudge it down
                a[0] = np.delete(a[0], 0, 1)
                a = front_propagation(a[0], theta)
                cost_m = cost(a, y, theta, lambda_t)
                theta[l][i][j] += e                       # restore the weight
                yield (it, l, i, j, (cost_p - cost_m) / (2 * e))
                it += 1
def shape(v, text):
    ''' Debug helper: print the shape of every array in a list. '''
    print("{}: ".format(text), end=' ')
    for i in v:
        print(i.shape, end=' ')
    print()

Training

# training
costs = pd.Series(dtype=float)
for i in trange(steps):
    a = front_propagation(dataset_i, theta)
    costs[i] = cost(a, dataset_o, theta, lambda_t)
    g1 = grad(a, dataset_o, theta, lambda_t)
    for l in range(len(theta)):                # gradient-descent update
        theta[l] -= g1[l] * alpha
cost_ts = costs.iloc[-1]                       # final training cost, reported in the validation step
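Because each iteration already performs a full forward pass, it is cheap to also track the validation cost and keep the best weights seen so far, a simple guard against overtraining. A sketch of that variant, assuming test_i and test_o have been built as in the Validation section below:

best_cost, best_theta = np.inf, None
for i in trange(steps):
    a = front_propagation(dataset_i, theta)
    g1 = grad(a, dataset_o, theta, lambda_t)
    for l in range(len(theta)):
        theta[l] -= g1[l] * alpha
    vd_cost = cost(front_propagation(test_i, theta), test_o, theta, lambda_t)
    if vd_cost < best_cost:                    # remember the best weights so far
        best_cost, best_theta = vd_cost, [t.copy() for t in theta]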

if False:  # DEBUG: compare analytic and numerical gradients
    for it, l, i, j, num_grad in numerical_grad(a, dataset_o, theta, lambda_t):
        print("#[{:02d}][{:02d}][{:02d}] {:11.8f} => {:11.8f} ({:.2f}%)".format(l, i, j,
                g1[l][i][j], num_grad, (g1[l][i][j] - num_grad) / num_grad * 100))
        if it > 20:
            break
100%|██████████| 200/200 [01:16<00:00,  2.63it/s]
plt.figure(figsize=(20, 5))
plt.plot(costs)
plt.title('Model Progress')
plt.ylabel('Loss (cross-entropy)')
plt.xlabel('# Epochs')
plt.show();

[Figure: training loss curve decreasing over the 200 iterations]

Validation

hyperp_filename = 'hyperparameters.csv'
if os.path.exists(hyperp_filename):
    hyperp = pd.read_csv(hyperp_filename)
else:
    hyperp = pd.DataFrame(columns=["ds_volume", "threshold", "layers", "nodes_per_layer",
                                   "alpha", "lambda_t", "steps", "accuracy", "cost_ts", "cost_vd"])
test_i = all_x[vd]
test_o = all_y[vd]
test_o = to_multiclass(test_o, (test_o.shape[0], theta[-1].shape[1]))

# Confusion matrix; row/column 0 are unused because labels run from 1 to 10
guesses = np.zeros((test_o.shape[1]+1, test_o.shape[1]+1))

activations = front_propagation(test_i, theta)
hypothesis = activations[-1]
for i in range(hypothesis.shape[0]):
    guesses[np.argmax(test_o[i])+1, np.argmax(hypothesis[i])+1] += 1

accuracy = guesses.trace()/np.sum(guesses)*100
cost_vd = cost(activations, test_o, theta, lambda_t)   # cost() expects the full list of activations
print("Accuracy: {:2.2f}%".format(accuracy))
print("Cost dataset   : {:.2f}".format(cost_ts))
print("Cost validation: {:.2f}".format(cost_vd))

display(pd.DataFrame(guesses))

# saving (DataFrame.append was removed in pandas 2.0, so use pd.concat)
new_row = pd.DataFrame([{
    'ds_volume': ds_volume,
    'threshold': threshold,
    'layers': len(dimensions),
    'nodes_per_layer': dimensions[1],
    'alpha': alpha,
    'lambda_t': lambda_t,
    'steps': steps,
    'accuracy': accuracy,
    'cost_ts': cost_ts,
    'cost_vd': cost_vd
}])
hyperp = pd.concat([hyperp, new_row], ignore_index=True)
hyperp.to_csv(hyperp_filename, index=False)
display(hyperp.sort_values(by='cost_vd', ascending=True))
Accuracy: 81.96%
Cost dataset   : 1.52
Cost validation: 4.48
Rows are the true classes, columns the predicted ones (class 10 encodes the digit 0; row and column 0 are unused):

0 1 2 3 4 5 6 7 8 9 10
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 243.0 0.0 2.0 0.0 1.0 1.0 2.0 0.0 0.0 0.0
2 0.0 8.0 230.0 3.0 5.0 0.0 4.0 5.0 1.0 0.0 6.0
3 0.0 5.0 14.0 226.0 0.0 5.0 1.0 5.0 0.0 0.0 3.0
4 0.0 5.0 6.0 0.0 220.0 2.0 8.0 3.0 0.0 5.0 2.0
5 0.0 6.0 2.0 33.0 3.0 178.0 9.0 0.0 0.0 0.0 25.0
6 0.0 7.0 3.0 0.0 0.0 4.0 223.0 0.0 0.0 0.0 4.0
7 0.0 17.0 3.0 0.0 2.0 1.0 0.0 222.0 0.0 0.0 3.0
8 0.0 21.0 12.0 42.0 0.0 11.0 3.0 1.0 146.0 6.0 4.0
9 0.0 5.0 0.0 3.0 17.0 4.0 2.0 83.0 1.0 116.0 7.0
10 0.0 0.0 0.0 1.0 0.0 0.0 3.0 1.0 0.0 0.0 245.0
ds_volume threshold layers nodes_per_layer alpha lambda_t steps accuracy cost_ts cost_vd
5 1.0 0.5 3.0 14.0 1.0 0.100 200.0 9.44 1.516239 3.255590
2 1.0 0.5 3.0 14.0 0.1 0.010 200.0 37.28 1.516239 3.437609
3 1.0 0.5 3.0 14.0 1.0 0.010 200.0 83.92 1.516239 3.814707
0 1.0 0.5 3.0 14.0 2.0 0.010 200.0 84.08 1.516239 4.160384
6 1.0 0.5 3.0 14.0 2.0 0.012 200.0 81.96 1.516239 4.478654
1 1.0 0.5 4.0 120.0 2.0 0.010 200.0 67.48 1.516239 4.785977
4 1.0 0.5 3.0 14.0 1.0 0.001 200.0 78.40 1.516239 6.431485
plt.figure(figsize=(16, 8))
sns.heatmap(guesses[1:, 1:])
plt.title('Confusion Matrix')
plt.show();

[Figure: heatmap of the confusion matrix]

hyperp.sort_values(by='accuracy', ascending=False).head()
ds_volume threshold layers nodes_per_layer alpha lambda_t steps accuracy cost_ts cost_vd
0 1.0 0.5 3.0 14.0 2.0 0.010 200.0 84.08 1.516239 4.160384
3 1.0 0.5 3.0 14.0 1.0 0.010 200.0 83.92 1.516239 3.814707
6 1.0 0.5 3.0 14.0 2.0 0.012 200.0 81.96 1.516239 4.478654
4 1.0 0.5 3.0 14.0 1.0 0.001 200.0 78.40 1.516239 6.431485
1 1.0 0.5 4.0 120.0 2.0 0.010 200.0 67.48 1.516239 4.785977
plt.figure(figsize=(20, 5))
plt.scatter(hyperp['cost_vd'], hyperp['accuracy'])
plt.title('Hyperparameters results')
plt.xlabel('Loss in validation set')
plt.ylabel('Accuracy')
plt.show();

[Figure: scatter plot of accuracy against validation loss for each hyperparameter setting]

Experiment #1 - What regions does the neural network look at?

If we fix a target output vector and run the neural network backwards, multiplying it by the transposed weight matrices, we produce an image. This image shows the regions towards which the network “directs” its weights when deciding on that output, i.e., the most important regions for each digit.

# Set a digit value here
digit = 3
out_tensor = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
out_tensor[digit - 1] = 1                 # one-hot output for the chosen digit
k = []
k.append(np.dot(out_tensor, theta[1].T))  # project back through the second layer
k[0] = k[0][1:]                           # drop the bias component
k.append(np.dot(k[0], theta[0].T))        # project back through the first layer
k[1] = k[1][1:]                           # drop the bias; 400 values = one 20x20 image

plt.subplots(1, 2, figsize=(20, 5))
plt.subplot(1, 2, 1)
sns.heatmap(k[1].reshape(20, 20).T, center=0)
plt.title("Important regions for digit '{}'".format(digit))
plt.axis('off')
plt.subplot(1, 2, 2)
sns.heatmap(k[1].reshape(20, 20).T, cmap='gray')
plt.title("Important regions for digit '{}'".format(digit))
plt.axis('off')
plt.show();
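The same backward projection generalises to every class at once; wrapping it in a small helper makes it easy to compare the ten digits side by side (a sketch; digit_regions is a hypothetical helper, not defined in the notebook above):

def digit_regions(label, theta):
    # Backward projection of a one-hot output at position label - 1
    out = np.zeros(10)
    out[label - 1] = 1
    hidden = np.dot(out, theta[1].T)[1:]      # back through layer 2, drop bias
    return np.dot(hidden, theta[0].T)[1:]     # back through layer 1, drop bias

plt.subplots(2, 5, figsize=(20, 8))
for label in range(1, 11):                    # labels 1-10 (10 encodes digit 0)
    plt.subplot(2, 5, label)
    sns.heatmap(digit_regions(label, theta).reshape(20, 20).T, center=0, cbar=False)
    plt.title("Digit {}".format(label % 10))
    plt.axis('off')
plt.show()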

[Figure: important pixel regions for the chosen digit, shown with a diverging and a grayscale colour map]
