Commit d873eb66 authored by Stefan Adalbjörnsson's avatar Stefan Adalbjörnsson
Browse files

RBM HW, prelim

parent 8893fe08
Work-log for HW3 RBM machines.
12/10: We looked at the two implementations considered for the exercise and settled on using
"rbm_MNIST_test.py". It was easy to get running and already created the visualization of the filter and Gibbs sampled images, i.e., the feature column vectors, corresponding to each hidden node. We also tried changing the number of hidden nodes to 10 just to verify that it would not be a good idea as was obvious when looking at the found features.
16/10: After reviewing the videos on the training we had a hard time understanding the updating scheme used in the code:
h0 = sample_prob(tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb))
v1 = sample_prob(tf.nn.sigmoid(
tf.matmul(h0, tf.transpose(rbm_w)) + rbm_vb))
h1 = tf.nn.sigmoid(tf.matmul(v1, rbm_w) + rbm_hb)
w_positive_grad = tf.matmul(tf.transpose(X), h0)
w_negative_grad = tf.matmul(tf.transpose(v1), h1)
Which we did not think was consistent with the description of the contrastive divergence method. We instead used:
h0 = sample_prob(tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb))
h2 = tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb)
v1 = sample_prob(tf.nn.sigmoid(
tf.matmul(h0, tf.transpose(rbm_w)) + rbm_vb))
nbr_gibbs=1
for nbr_Gibbs in range(nbr_gibbs):
h0 = sample_prob(tf.nn.sigmoid(tf.matmul(v1, rbm_w) + rbm_hb))
v1 = sample_prob(tf.nn.sigmoid(tf.matmul(h0, tf.transpose(rbm_w)) + rbm_vb))
h1 = tf.nn.sigmoid(tf.matmul(v1, rbm_w) + rbm_hb)
w_positive_grad = tf.matmul(tf.transpose(X), h2)
w_negative_grad = tf.matmul(tf.transpose(v1), h1)
Which also can use more Gibbs sampling steps by increasing nbr_gibbs. Did not seem to help.
Initializing the feature matrix with small Gaussian variables did not seem to help.
Next we implemented more iterations of the training data by creating an outer for-loop. Experimentally about 10 epochs was sufficient for reaching a local minimum.
We tried initializing the weight matrix with random variables.
By increasing the number of samples we got worse reconstruction error, ||X-\tilde(X)||, from 0.0704 to 0.0551 (10 steps) 0.0562 (40 steps). However, the computational time increased approximately 3x (10 steps) and 8x (40 steps).
Note that, it's hard to use the reconstruction error as an indication of performance and, for example, no significant difference was seen in the learned feature vectors.
We tried to implement a classifier as well, using a fully connected single layer using p(h|X) as input. The results were approx 96% accuracy, while a two layer fully connected model, using same size of hidden layers, achieved approx 97%.
#!/usr/bin/env python
"""Functions for downloading and reading MNIST data."""
import gzip
import os
from six.moves.urllib.request import urlretrieve
import numpy
SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
def maybe_download(filename, work_directory):
"""Download the data from Yann's website, unless it's already here."""
if not os.path.exists(work_directory):
os.mkdir(work_directory)
filepath = os.path.join(work_directory, filename)
if not os.path.exists(filepath):
filepath, _ = urlretrieve(SOURCE_URL + filename, filepath)
statinfo = os.stat(filepath)
print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
return filepath
def _read32(bytestream):
dt = numpy.dtype(numpy.uint32).newbyteorder('>')
return numpy.frombuffer(bytestream.read(4), dtype=dt)
def extract_images(filename):
"""Extract the images into a 4D uint8 numpy array [index, y, x, depth]."""
print('Extracting', filename)
with gzip.open(filename) as bytestream:
magic = _read32(bytestream)
if magic != 2051:
raise ValueError(
'Invalid magic number %d in MNIST image file: %s' %
(magic, filename))
num_images = _read32(bytestream)
rows = _read32(bytestream)
cols = _read32(bytestream)
buf = bytestream.read(rows * cols * num_images)
data = numpy.frombuffer(buf, dtype=numpy.uint8)
data = data.reshape(num_images, rows, cols, 1)
return data
def dense_to_one_hot(labels_dense, num_classes=10):
"""Convert class labels from scalars to one-hot vectors."""
num_labels = labels_dense.shape[0]
index_offset = numpy.arange(num_labels) * num_classes
labels_one_hot = numpy.zeros((num_labels, num_classes))
labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
return labels_one_hot
def extract_labels(filename, one_hot=False):
"""Extract the labels into a 1D uint8 numpy array [index]."""
print('Extracting', filename)
with gzip.open(filename) as bytestream:
magic = _read32(bytestream)
if magic != 2049:
raise ValueError(
'Invalid magic number %d in MNIST label file: %s' %
(magic, filename))
num_items = _read32(bytestream)
buf = bytestream.read(num_items)
labels = numpy.frombuffer(buf, dtype=numpy.uint8)
if one_hot:
return dense_to_one_hot(labels)
return labels
class DataSet(object):
def __init__(self, images, labels, fake_data=False):
if fake_data:
self._num_examples = 10000
else:
assert images.shape[0] == labels.shape[0], (
"images.shape: %s labels.shape: %s" % (images.shape,
labels.shape))
self._num_examples = images.shape[0]
# Convert shape from [num examples, rows, columns, depth]
# to [num examples, rows*columns] (assuming depth == 1)
assert images.shape[3] == 1
images = images.reshape(images.shape[0],
images.shape[1] * images.shape[2])
# Convert from [0, 255] -> [0.0, 1.0].
images = images.astype(numpy.float32)
images = numpy.multiply(images, 1.0 / 255.0)
self._images = images
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
@property
def images(self):
return self._images
@property
def labels(self):
return self._labels
@property
def num_examples(self):
return self._num_examples
@property
def epochs_completed(self):
return self._epochs_completed
def next_batch(self, batch_size, fake_data=False):
"""Return the next `batch_size` examples from this data set."""
if fake_data:
fake_image = [1.0 for _ in xrange(784)]
fake_label = 0
return [fake_image for _ in xrange(batch_size)], [
fake_label for _ in xrange(batch_size)]
start = self._index_in_epoch
self._index_in_epoch += batch_size
if self._index_in_epoch > self._num_examples:
# Finished epoch
self._epochs_completed += 1
# Shuffle the data
perm = numpy.arange(self._num_examples)
numpy.random.shuffle(perm)
self._images = self._images[perm]
self._labels = self._labels[perm]
# Start next epoch
start = 0
self._index_in_epoch = batch_size
assert batch_size <= self._num_examples
end = self._index_in_epoch
return self._images[start:end], self._labels[start:end]
def read_data_sets(train_dir, fake_data=False, one_hot=False):
class DataSets(object):
pass
data_sets = DataSets()
if fake_data:
data_sets.train = DataSet([], [], fake_data=True)
data_sets.validation = DataSet([], [], fake_data=True)
data_sets.test = DataSet([], [], fake_data=True)
return data_sets
TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
VALIDATION_SIZE = 5000
local_file = maybe_download(TRAIN_IMAGES, train_dir)
train_images = extract_images(local_file)
local_file = maybe_download(TRAIN_LABELS, train_dir)
train_labels = extract_labels(local_file, one_hot=one_hot)
local_file = maybe_download(TEST_IMAGES, train_dir)
test_images = extract_images(local_file)
local_file = maybe_download(TEST_LABELS, train_dir)
test_labels = extract_labels(local_file, one_hot=one_hot)
validation_images = train_images[:VALIDATION_SIZE]
validation_labels = train_labels[:VALIDATION_SIZE]
train_images = train_images[VALIDATION_SIZE:]
train_labels = train_labels[VALIDATION_SIZE:]
data_sets.train = DataSet(train_images, train_labels)
data_sets.validation = DataSet(validation_images, validation_labels)
data_sets.test = DataSet(test_images, test_labels)
return data_sets
import tensorflow as tf
import numpy as np
import input_data
from PIL import Image
from utils import tile_raster_images
import pdb
def sample_prob(probs):
return tf.nn.relu(
tf.sign(
probs - tf.random_uniform(tf.shape(probs))))
alpha = .5
batchsize = 100
n_hidden=200
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images,\
mnist.test.labels
X = tf.placeholder("float", [None, 784])
Y = tf.placeholder("float", [None, 10])
rbm_w = tf.placeholder("float", [784, n_hidden])
rbm_vb = tf.placeholder("float", [784])
rbm_hb = tf.placeholder("float", [n_hidden])
h0 = sample_prob(tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb))
h2 = tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb)
v1 = sample_prob(tf.nn.sigmoid(
tf.matmul(h0, tf.transpose(rbm_w)) + rbm_vb))
for nbr_Gibbs in range(0):
h0 = sample_prob(tf.nn.sigmoid(tf.matmul(v1, rbm_w) + rbm_hb))
v1 = sample_prob(tf.nn.sigmoid(tf.matmul(h0, tf.transpose(rbm_w)) + rbm_vb))
h1 = tf.nn.sigmoid(tf.matmul(v1, rbm_w) + rbm_hb)
w_positive_grad = tf.matmul(tf.transpose(X), h2)
w_negative_grad = tf.matmul(tf.transpose(v1), h1)
update_w = rbm_w + alpha * \
(w_positive_grad - w_negative_grad) / tf.to_float(tf.shape(X)[0])
update_vb = rbm_vb + alpha * tf.reduce_mean(X - v1, 0)
update_hb = rbm_hb + alpha * tf.reduce_mean(h0 - h1, 0)
h_sample = sample_prob(tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb))
v_sample = sample_prob(tf.nn.sigmoid(
tf.matmul(h_sample, tf.transpose(rbm_w)) + rbm_vb))
err = X - v_sample
err_sum = tf.reduce_mean(err * err)
sess = tf.Session()
n_w = np.zeros([784,n_hidden], np.float32)
n_vb = np.zeros([784], np.float32)
n_hb = np.zeros([n_hidden], np.float32)
#o_w = np.zeros([784, n_hidden], np.float32)
o_w = np.random.normal(loc=0, scale=0.1, size=[784, n_hidden])*0
o_vb = np.zeros([784], np.float32)
o_hb = np.zeros([n_hidden], np.float32)
init = tf.initialize_all_variables()
sess.run(init)
nbr_epocs=10
print len(trX)
print sess.run(
err_sum, feed_dict={X: trX, rbm_w: o_w, rbm_vb: o_vb, rbm_hb: o_hb})
for iter in range(nbr_epocs):
IndexShuffle=np.random.permutation(len(trX))
for start, end in zip(
range(0, len(trX), batchsize), range(batchsize, len(trX), batchsize)):
batch = trX[np.ix_(IndexShuffle[start:end])]
n_w = sess.run(update_w, feed_dict={
X: batch, rbm_w: o_w, rbm_vb: o_vb, rbm_hb: o_hb})
n_vb = sess.run(update_vb, feed_dict={
X: batch, rbm_w: o_w, rbm_vb: o_vb, rbm_hb: o_hb})
n_hb = sess.run(update_hb, feed_dict={
X: batch, rbm_w: o_w, rbm_vb: o_vb, rbm_hb: o_hb})
o_w = n_w
o_vb = n_vb
o_hb = n_hb
o_v1 = sess.run(v1, feed_dict={
X: batch, rbm_w: o_w, rbm_vb: o_vb, rbm_hb: o_hb})
if start % 10000 == 0:
print sess.run(
err_sum, feed_dict={X: trX, rbm_w: n_w, rbm_vb: n_vb, rbm_hb: n_hb})
image = Image.fromarray(
tile_raster_images(
X=n_w.T,
img_shape=(28, 28),
tile_shape=(25, 20),
tile_spacing=(1, 1)
)
)
imageGibbs = Image.fromarray(
tile_raster_images(
X=o_v1,
img_shape=(28, 28),
tile_shape=(25, 20),
tile_spacing=(1, 1)
)
)
imageGibbs.save("Gibbsrbm.png")
image.save("Wrbm.png")
pdb.set_trace()
h2 = tf.nn.sigmoid(tf.matmul(X, rbm_w) + rbm_hb)
import pickle
# obj0, obj1, obj2 are created here...
# Saving the objects:
with open('objs.pickle', 'w') as f: pickle.dump([n_w, n_hb], f)
import tensorflow as tf
import numpy as np
import input_data
from PIL import Image
from utils import tile_raster_images
import pdb
import pickle
# Getting back the trained variables:
with open('objs.pickle') as f:
n_w, n_hb = pickle.load(f)
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images,\
mnist.test.labels
x = tf.placeholder(tf.float32, [None, 784])
h2 = tf.nn.sigmoid(tf.matmul(x, n_w) + n_hb)
W = tf.Variable(tf.truncated_normal([200, 10],stddev=0.1))
b = tf.Variable(tf.zeros([10])+0.1)
y = tf.nn.softmax(tf.matmul(h2, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y+1e-13), reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
for i in range(10000):
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
if i%100==0:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
import tensorflow as tf
import numpy as np
import input_data
from PIL import Image
from utils import tile_raster_images
import pdb
import pickle
# Getting back the trained variables:
with open('objs.pickle') as f:
n_w, n_hb = pickle.load(f)
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images,\
mnist.test.labels
x = tf.placeholder(tf.float32, [None, 784])
W1 = tf.Variable(tf.truncated_normal([784, 200],stddev=0.1))
b1 = tf.Variable(tf.zeros([200])+0.1)
h2 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)
#y = tf.nn.softmax(tf.matmul(x, W1) + b1)
W = tf.Variable(tf.truncated_normal([200, 10],stddev=0.1))
b = tf.Variable(tf.zeros([10])+0.1)
y = tf.nn.softmax(tf.matmul(h2, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y+1e-13), reduction_indices=[1]))
#train_step = tf.train.GradientDescentOptimizer(1).minimize(cross_entropy)
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
for i in range(10000):
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
if i%100==0:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
""" This file contains different utility functions that are not connected
in anyway to the networks presented in the tutorials, but rather help in
processing the outputs into a more understandable way.
For example ``tile_raster_images`` helps in generating a easy to grasp
image from a set of samples or weights.
"""
import numpy
def scale_to_unit_interval(ndar, eps=1e-8):
""" Scales all values in the ndarray ndar to be between 0 and 1 """
ndar = ndar.copy()
ndar -= ndar.min()
ndar *= 1.0 / (ndar.max() + eps)
return ndar
def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
scale_rows_to_unit_interval=True,
output_pixel_vals=True):
"""
Transform an array with one flattened image per row, into an array in
which images are reshaped and layed out like tiles on a floor.
This function is useful for visualizing datasets whose rows are images,
and also columns of matrices for transforming those rows
(such as the first layer of a neural net).
:type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
be 2-D ndarrays or None;
:param X: a 2-D array in which every row is a flattened image.
:type img_shape: tuple; (height, width)
:param img_shape: the original shape of each image
:type tile_shape: tuple; (rows, cols)
:param tile_shape: the number of images to tile (rows, cols)
:param output_pixel_vals: if output should be pixel values (i.e. int8
values) or floats
:param scale_rows_to_unit_interval: if the values need to be scaled before
being plotted to [0,1] or not
:returns: array suitable for viewing as an image.
(See:`Image.fromarray`.)
:rtype: a 2-d array with same dtype as X.
"""
assert len(img_shape) == 2
assert len(tile_shape) == 2
assert len(tile_spacing) == 2
# The expression below can be re-written in a more C style as
# follows :
#
# out_shape = [0,0]
# out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
# tile_spacing[0]
# out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
# tile_spacing[1]
out_shape = [
(ishp + tsp) * tshp - tsp
for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
]
if isinstance(X, tuple):
assert len(X) == 4
# Create an output numpy ndarray to store the image
if output_pixel_vals:
out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
dtype='uint8')
else:
out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
dtype=X.dtype)
#colors default to 0, alpha defaults to 1 (opaque)
if output_pixel_vals:
channel_defaults = [0, 0, 0, 255]
else:
channel_defaults = [0., 0., 0., 1.]
for i in range(4):
if X[i] is None:
# if channel is None, fill it with zeros of the correct
# dtype
dt = out_array.dtype
if output_pixel_vals:
dt = 'uint8'
out_array[:, :, i] = numpy.zeros(
out_shape,
dtype=dt
) + channel_defaults[i]
else:
# use a recurrent call to compute the channel and store it
# in the output
out_array[:, :, i] = tile_raster_images(
X[i], img_shape, tile_shape, tile_spacing,
scale_rows_to_unit_interval, output_pixel_vals)
return out_array
else:
# if we are dealing with only one channel
H, W = img_shape
Hs, Ws = tile_spacing
# generate a matrix to store the output
dt = X.dtype
if output_pixel_vals:
dt = 'uint8'
out_array = numpy.zeros(out_shape, dtype=dt)
for tile_row in range(tile_shape[0]):
for tile_col in range(tile_shape[1]):
if tile_row * tile_shape[1] + tile_col < X.shape[0]:
this_x = X[tile_row * tile_shape[1] + tile_col]