# Implementing a single-layer, fully connected neural network

First, we will import the libraries that we need...

In [None]:
import tensorflow as tf
import numpy as np

# taking real data from one of the built-in tutorials:
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import OneHotEncoder
from random import shuffle

Some tools that we need, nothing interesting here...

In [None]:
def load_data():
    '''
    Fetch the MNIST data and split it at random into a train and a test part.

    The feature values are divided by 255.0 and the labels are one-hot
    encoded into 10 columns. A random 80% of the examples form the train
    part, the remaining examples form the test part.

    Returns a pair of dictionaries (train, test). Each one maps "X" to the
    examples matrix (a row per example, a column per feature) and "y" to
    the one-hot label rows of those same examples.
    '''
    print('loading data... (first time might take some time)')
    dataset = fetch_mldata('MNIST original')

    features = dataset["data"] / 255.0
    # The encoder expects a single column, hence the reshape to (-1, 1).
    label_column = dataset["target"].reshape([-1, 1])
    encoder = OneHotEncoder(n_values=10, sparse=False)
    one_hot_labels = encoder.fit_transform(label_column)

    num_examples = len(one_hot_labels)
    cut = int(num_examples * 0.8)
    shuffled_rows = np.random.permutation(num_examples)

    def take(rows):
        # Pick the same rows from the examples and from their labels.
        return {'X': features[rows, :], 'y': one_hot_labels[rows, :]}

    train, test = take(shuffled_rows[:cut]), take(shuffled_rows[cut:])
    print('Data loaded successfully.\n')
    return train, test
 
 
def calc_accurecy(predictions, true_classes):
    '''
    Percentage of examples whose predicted class matches the true class.

    predictions  -- predicted class index of every example.
    true_classes -- one row per example; the position of the largest
                    entry in a row is taken as that example's true class.
    '''
    expected = np.argmax(true_classes, axis=1)
    hits = np.equal(predictions, expected)
    # Cast to float32 first so the mean is taken over 0.0 / 1.0 values.
    hit_rate = np.mean(hits.astype(np.float32))
    return hit_rate * 100
 
 
class CyclicDataIter(object):
    '''
    Infinite, cyclic mini-batch iterator over a dataset.

    Constructed from a dictionary with the keys "X" and "y" (two arrays
    with one row per example) and a batch size. Every step returns the
    next batch_size rows of X together with the same rows of y. When the
    end of the data is reached the iterator wraps around to the first row,
    as many times as needed, so every batch has exactly batch_size rows
    even when batch_size is larger than the number of examples.

    Raises ValueError when batch_size is smaller than 1 or the data has
    no rows.
    '''

    def __init__(self, data_dict, batch_size):
        self._X, self._y = data_dict['X'], data_dict['y']
        self._batch_size = batch_size
        self._i = 0                      # row at which the next batch starts
        self._max_i = self._X.shape[0]   # number of rows in the data

        if self._batch_size < 1:
            raise ValueError('batch_size must be at least 1, got %r' % (batch_size,))
        if self._max_i < 1:
            raise ValueError('cannot iterate over data with no rows')

    def __iter__(self):
        return self

    def next(self):
        '''Return the next (X rows, y rows) batch and advance the position.'''
        end_i = self._i + self._batch_size
        # Taking the row numbers modulo the data length performs the wrap
        # around, including batches that span the data more than once.
        locations = np.arange(self._i, end_i) % self._max_i
        self._i = end_i % self._max_i
        return self._X[locations, :], self._y[locations, :]

    def __next__(self):
        # Python 3 spelling of the iterator protocol; next() is kept for Python 2.
        return self.next()


# Construction Phase

We start by loading the data

In [None]:
# Load the data once; both parts are used throughout the rest of the notebook.
train, test = load_data()

train_keys, test_keys = train.keys(), test.keys()
print('The keys in each dictionary are:', train_keys, 'and', test_keys)

# Each part holds an examples matrix ("X") and a labels matrix ("y").
train_X_shape, train_y_shape = train['X'].shape, train['y'].shape
print('The dimenstions of the training sets are', train_X_shape, train_y_shape)
test_X_shape, test_y_shape = test['X'].shape, test['y'].shape
print('The dimenstions of the testing sets are', test_X_shape, test_y_shape)

In [None]:
# Read the dimensions off the loaded arrays instead of hard-coding them,
# so they always agree with the data that is actually fed to the network.
train_size, num_features = train['X'].shape
# The labels are one-hot encoded: one column per class.
num_classes = train['y'].shape[1]

Now constructing the inputs to the network: 

Consider the _shape_ property of the tensor that will denote the inputs.
Obviously it has to be two-dimensional, but what will be the size of the first dimension? 

The problem is that we have a different number of __rows__ in the training set and in the test set. Therefore, the number of rows in our placeholder has to be flexible!

Introducing the "None" value in a tensor's shape.

In [None]:
# The first dimension is None so that any number of rows can be fed.
data_input = tf.placeholder(tf.float32,
                            shape=[None, num_features],
                            name='inputs_placeholder')

# One row of num_classes values per example, fed alongside data_input.
true_label = tf.placeholder(tf.float32,
                            shape=[None, num_classes],
                            name='true_labels_placeholder')

In [None]:
# The static shape object, and the same shape as a plain Python list.
input_shape = data_input.get_shape()
print('The shape of the data is ', input_shape)
print('The actual shape values are:', input_shape.as_list())

Create the _parameters_ that the network will try to learn:

In [None]:
# Both parameters start from small random values (normal, mean 0, stddev 0.1).
initial_weights = tf.random_normal([num_features, num_classes], mean=0.0, stddev=0.1)
weights = tf.Variable(initial_weights, name='weights')

initial_biases = tf.random_normal([num_classes], mean=0.0, stddev=0.1)
biases = tf.Variable(initial_biases, name='biases')

Define the computations and keep the resulting tensors, we will use them later.

In [None]:
# Linear part of the layer: inputs times weights, then the bias added to every row.
weighted_inputs = tf.matmul(data_input, weights, name='mult_by_weights')
layer_value = tf.nn.bias_add(weighted_inputs, biases, name='adding_bias')

# Alternative spelling of the two lines above:
#layer_value = tf.matmul(data_input, weights, name='mult_by_weights') + biases

# Softmax turns every row of layer values into a probability vector.
probabilities = tf.nn.softmax(layer_value, name='applying_activation')

In [None]:
# Inspect the static shape of the network's output.
output_shape = probabilities.get_shape()
print(output_shape)

Observe that for each example, the network outputs a **probability** vector of length 10 (due to the _softmax_ activation). These probabilities reflect what the network believes is the distribution of classes for the given example. In order to get an actual prediction, we will return the _index_ of the coordinate with the largest probability.

In [None]:
# The predicted class of an example is the position of its largest probability.
predictions = tf.argmax(probabilities, axis=1, name='predictions')

predictions_shape = predictions.get_shape()
print(predictions_shape)

Here is a simple trick to check which operations the graph contains (note that TF breaks some operations into sub operations so the list can be a bit longer than expected)

In [None]:
# Collect the name of every operation in the default graph, then print one per line.
default_graph = tf.get_default_graph()
op_names = [op.name for op in default_graph.get_operations()]
print('\n'.join(op_names))

#### Important side note about _reshaping_:

Say that we want to multiply the _predictions_ by a matrix. We will fail to apply tf.matmul because both inputs have to be matrices (whereas our predictions tensor is a vector). The solution is to reshape the predictions into a new container - a matrix with a single column. It can be done as follows:

In [None]:
# Two constants to experiment with: a vector of 100 ones and a 10x10 matrix of zeros.
vector = tf.constant(1, shape=[100])
matrix = tf.constant(0, shape=[10, 10])

# Same elements, new shapes; a -1 entry lets TF work out that dimension itself.
vector_to_matrix = tf.reshape(vector, [100, 1])
matrix_to_vector = tf.reshape(matrix, [-1])
high_order_tensor = tf.reshape(matrix, [2, 5, 5, -1])

old_vector_shape, new_vector_shape = vector.get_shape(), vector_to_matrix.get_shape()
print('"vector" had shape', old_vector_shape, 'and now:', new_vector_shape)
old_matrix_shape, new_matrix_shape = matrix.get_shape(), matrix_to_vector.get_shape()
print('"matrix" had shape', old_matrix_shape, 'and now:', new_matrix_shape)

print('high order tensor has shape:', high_order_tensor.get_shape())

## Trying to run the network!

We need to add one more operation to the graph:

In [None]:
# Op that initializes the graph's global variables when it is run in a session.
init_all_variables_op = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    # Question for the reader: why is the session object not passed here?
    init_all_variables_op.run()

    # Performance on the test set, before any training took place.
    test_feed = {data_input: test['X']}
    preds = sess.run(predictions, feed_dict=test_feed)  # preds comes back as a np.ndarray
    score = calc_accurecy(preds, test['y'])

    print('Accuracy of the network on test set (in %%): %s' % score)

# What is left now is to _train_ the network

In order to _learn_ we need to define the loss function and a learning algorithm

In [None]:
# Cross entropy between the one-hot labels and the network's softmax output,
# averaged over the examples in the batch.
# It is computed from the pre-softmax layer values (the "logits") by TF's fused op
# rather than as log(softmax(...)): a probability that underflows to 0 would make
# tf.log return -inf and turn the loss and its gradients into NaN.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=true_label,
                                                           logits=layer_value)
loss_tensor = tf.reduce_mean(per_example_loss)

# Plain gradient descent with learning rate 0.5; running learn_op performs one update step.
learn_op = tf.train.GradientDescentOptimizer(0.5).minimize(loss_tensor)

Now we will train before predicting on the test set:

In [None]:
with tf.Session() as sess:
    init_all_variables_op.run()

    # Training: 1000 update steps, each on the next batch of 100 training examples.
    train_iterator = CyclicDataIter(train, batch_size=100)
    for step in range(1000):
        x_batch, y_batch = next(train_iterator)
        train_feed = {data_input: x_batch, true_label: y_batch}
        sess.run(learn_op, feed_dict=train_feed)

    # Performance on the test set, now with the trained parameters.
    test_feed = {data_input: test['X']}
    preds = sess.run(predictions, feed_dict=test_feed)
    score = calc_accurecy(preds, test['y'])

    print('Accuracy of the network on test set (in %%): %s' % score)