diff --git a/hw11_natural_language_processing/martink_word2vec/printouts b/hw11_natural_language_processing/martink_word2vec/printouts new file mode 100644 index 0000000000000000000000000000000000000000..fb384a63c4aa361aad9f62fd09e39c3bc82b771a --- /dev/null +++ b/hw11_natural_language_processing/martink_word2vec/printouts @@ -0,0 +1,244 @@ +bash-4.3$ python word2vec_basic.py +Found and verified text8.zip +Data size 17005207 +Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)] +Sample data [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against'] +3084 originated -> 12 as +3084 originated -> 5239 anarchism +12 as -> 3084 originated +12 as -> 6 a +6 a -> 195 term +6 a -> 12 as +195 term -> 2 of +195 term -> 6 a +Initialized +Average loss at step 0 : 260.113525391 +Nearest to a: expired, colony, vassar, documentation, renamo, parades, uncompetitive, garbled, +Nearest to years: rolf, sciences, arid, subtlety, benzene, prefecture, ventilated, tupac, +Nearest to may: gabor, moussa, marathon, mccain, mondo, robbie, ordinations, koenigsegg, +Nearest to new: reznor, monophosphate, bonnie, kornilov, subordinates, arcas, holiness, offices, +Nearest to all: unfold, newton, campbell, rote, dong, kabbalist, newly, tice, +Nearest to used: collision, contents, phonetic, besht, fondly, unguarded, millay, servicio, +Nearest to called: landing, pi, acetylcholine, attorney, entrance, importer, crowe, position, +Nearest to that: prohibit, habit, comedian, mythical, mir, financi, sweyn, mages, +Nearest to three: reeves, wounding, blass, humans, myth, leukemia, ritually, manitoba, +Nearest to the: consisting, gibb, oman, seleucids, semiarid, promote, vectorborne, macs, +Nearest to when: interference, carnivores, wco, eine, suppress, continual, linton, teachings, +Nearest to its: vidzeme, aoc, missile, grapple, caterpillars, accessed, aaas, boyle, +Nearest to use: rigorously, ashoka, weizenbaum, shedding, white, thuringiensis, gettysburg, expectation, +Nearest to he: dont, hutchins, whit, withhold, reina, push, anatolian, illustrious, +Nearest to six: equilateral, viscosity, self, zed, epa, ecuadorian, pap, gisela, +Nearest to s: mendeleev, vulgaris, pack, zangger, blind, respectful, attica, grenades, +Average loss at step 2000 : 113.902854166 +Average loss at step 4000 : 52.8573114848 +Average loss at step 6000 : 33.0519484727 +Average loss at step 8000 : 24.1580560323 +Average loss at step 10000 : 17.9564257891 +Nearest to a: the, this, and, UNK, influence, scrimmage, victoriae, analogue, +Nearest to years: sciences, amsterdam, nine, victoriae, six, arid, mathbf, reginae, +Nearest to may: meeting, developers, psychology, plot, spread, amyotrophic, alpina, koenigsegg, +Nearest to new: and, gb, circles, go, offices, proposal, vesta, seam, +Nearest to all: newton, reach, newly, campbell, absence, austin, history, coordinate, +Nearest to used: phonetic, contents, mya, breeds, left, decline, disagree, appeal, +Nearest to called: position, landing, attorney, railway, pi, entrance, split, vichy, +Nearest to that: never, victoriae, and, habit, johan, photographs, reality, austin, +Nearest to three: gb, mathbf, gland, tubing, nine, eight, zero, victoriae, +Nearest to the: a, UNK, and, his, victoriae, mathbf, one, its, +Nearest to when: interference, austin, suppress, eine, carnivores, mathbf, teachings, unfiltered, +Nearest to its: the, accessed, version, striking, victoriae, pure, 
a, pierce, +Nearest to use: gland, recommend, white, rigorously, accept, austin, politically, gettysburg, +Nearest to he: it, mathbf, and, gollancz, gland, you, an, they, +Nearest to six: nine, zero, mathbf, vs, phi, reginae, one, victoriae, +Nearest to s: and, sahara, troops, currently, the, zero, boroughs, of, +Average loss at step 12000 : 13.8088547783 +Average loss at step 14000 : 11.5112998215 +Average loss at step 16000 : 9.71534106708 +Average loss at step 18000 : 8.59706710231 +Average loss at step 20000 : 7.75074116433 +Nearest to a: the, this, agouti, or, adiabatic, and, culture, agnostic, +Nearest to years: amsterdam, sciences, benzene, four, victoriae, six, five, happens, +Nearest to may: zee, developers, psychology, polyhedra, extremophiles, precisely, meeting, koenigsegg, +Nearest to new: circles, holiness, and, gb, proposal, offices, seam, vesta, +Nearest to all: polyhedra, newton, history, ceiling, reach, coordinate, the, absence, +Nearest to used: contents, breeds, phonetic, decline, disagree, mya, dasyprocta, left, +Nearest to called: position, pi, attorney, exchanging, landing, entrance, railway, split, +Nearest to that: which, habit, and, johan, never, victoriae, in, dasyprocta, +Nearest to three: eight, five, two, six, seven, zero, nine, dasyprocta, +Nearest to the: a, his, its, agouti, their, one, victoriae, adiabatic, +Nearest to when: austin, interference, suppress, eine, carnivores, mathbf, bosnia, and, +Nearest to its: the, his, their, aoc, tumor, accessed, metis, agouti, +Nearest to use: ashoka, gland, rigorously, recommend, polyhedra, accept, shedding, gettysburg, +Nearest to he: it, they, and, who, mathbf, there, was, zero, +Nearest to six: nine, eight, zero, four, seven, three, five, mathbf, +Nearest to s: and, zero, the, his, of, dasyprocta, sahara, troops, +Average loss at step 22000 : 7.29002120793 +Average loss at step 24000 : 6.93063378406 +Average loss at step 26000 : 6.62375506437 +Average loss at step 28000 : 6.15877870786 +Average loss at step 30000 : 6.14214335346 +Nearest to a: the, this, agouti, or, aba, abet, victoriae, their, +Nearest to years: amsterdam, sciences, benzene, four, victoriae, five, subtlety, happens, +Nearest to may: zee, can, trudeau, developers, extremophiles, nine, koenigsegg, jack, +Nearest to new: circles, holiness, yangon, seam, offices, gb, proposal, profound, +Nearest to all: polyhedra, newton, history, absence, ceiling, virgin, coordinate, unfold, +Nearest to used: contents, breeds, decline, disagree, left, phonetic, appeal, dasyprocta, +Nearest to called: exchanging, pi, position, iic, attorney, entrance, champlain, split, +Nearest to that: which, habit, never, johan, victoriae, polyhedra, antitrust, objective, +Nearest to three: eight, four, five, six, seven, two, nine, zero, +Nearest to the: their, its, his, a, agouti, adiabatic, this, victoriae, +Nearest to when: primigenius, austin, suppress, and, eine, with, interference, mathbf, +Nearest to its: the, their, his, aoc, metis, tumor, accessed, a, +Nearest to use: ashoka, gland, rigorously, polyhedra, recommend, accept, gettysburg, rooms, +Nearest to he: it, they, she, who, and, there, mathbf, gotten, +Nearest to six: eight, nine, four, seven, five, three, zero, two, +Nearest to s: and, his, of, the, zero, or, two, dasyprocta, +Average loss at step 32000 : 5.85488548732 +Average loss at step 34000 : 5.83861638045 +Average loss at step 36000 : 5.72251720762 +Average loss at step 38000 : 5.25196716714 +Average loss at step 40000 : 5.48282074535 +Nearest to a: the, agouti, 
or, albury, this, aba, victoriae, abatis, +Nearest to years: amsterdam, six, sciences, benzene, four, five, victoriae, reuptake, +Nearest to may: can, zee, trudeau, should, will, eight, might, would, +Nearest to new: circles, goo, albury, gb, monophosphate, holiness, profound, yangon, +Nearest to all: polyhedra, newton, history, absence, coordinate, virgin, roper, mathbf, +Nearest to used: decline, dasyprocta, breeds, contents, disagree, mya, monatomic, continuum, +Nearest to called: exchanging, pi, position, iic, acetylcholine, entrance, champlain, split, +Nearest to that: which, this, what, it, polyhedra, never, victoriae, habit, +Nearest to three: four, six, five, eight, seven, two, zero, nine, +Nearest to the: its, a, adiabatic, their, his, this, agouti, abet, +Nearest to when: primigenius, austin, with, suppress, eine, including, and, but, +Nearest to its: their, the, his, tumor, metis, aoc, agouti, abet, +Nearest to use: ashoka, gland, polyhedra, recommend, rigorously, accept, gettysburg, rooms, +Nearest to he: it, she, they, who, there, mathbf, but, gotten, +Nearest to six: seven, eight, four, five, nine, three, zero, two, +Nearest to s: and, his, dasyprocta, cinque, the, metis, two, kifl, +Average loss at step 42000 : 5.32659191382 +Average loss at step 44000 : 5.27528748202 +Average loss at step 46000 : 5.24571657467 +Average loss at step 48000 : 5.05064962864 +Average loss at step 50000 : 5.15474711931 +Nearest to a: the, expired, agouti, hg, thibetanus, aba, appomattox, dipyramid, +Nearest to years: amsterdam, four, six, hoax, sciences, happens, five, victoriae, +Nearest to may: can, will, might, should, would, must, trudeau, eight, +Nearest to new: circles, albury, naaman, seam, scientifically, holiness, goo, monophosphate, +Nearest to all: polyhedra, newton, two, absence, mathbf, history, acts, three, +Nearest to used: decline, dasyprocta, breeds, handicap, monatomic, found, known, disagree, +Nearest to called: exchanging, iic, pi, position, acetylcholine, split, champlain, entrance, +Nearest to that: which, what, this, never, polyhedra, victoriae, naaman, but, +Nearest to three: four, six, seven, five, two, eight, one, nine, +Nearest to the: its, their, his, agouti, adiabatic, a, this, victoriae, +Nearest to when: primigenius, but, austin, eight, suppress, and, five, seven, +Nearest to its: their, the, his, agouti, tumor, metis, aoc, accessed, +Nearest to use: ashoka, polyhedra, gland, rigorously, recommend, accept, gettysburg, rooms, +Nearest to he: it, she, they, who, there, this, gotten, mathbf, +Nearest to six: eight, four, seven, five, three, nine, one, zero, +Nearest to s: his, zero, dasyprocta, and, cinque, the, was, metis, +Average loss at step 52000 : 5.18477400446 +Average loss at step 54000 : 5.11114428246 +Average loss at step 56000 : 5.03586944163 +Average loss at step 58000 : 5.17337947047 +Average loss at step 60000 : 4.93142760962 +Nearest to a: the, ssbn, wct, thibetanus, agouti, callithrix, cebus, aba, +Nearest to years: four, amsterdam, six, microcebus, five, months, hoax, victoriae, +Nearest to may: can, will, would, might, should, must, could, trudeau, +Nearest to new: circles, naaman, albury, goo, seam, scientifically, callithrix, monophosphate, +Nearest to all: microcebus, polyhedra, two, callithrix, cebus, acts, three, mathbf, +Nearest to used: decline, handicap, microcebus, dasyprocta, found, continuum, known, cebus, +Nearest to called: exchanging, pi, ssbn, acetylcholine, iic, split, champlain, tom, +Nearest to that: which, this, what, never, cebus, 
it, objective, tamarin, +Nearest to three: five, four, six, seven, two, eight, nine, one, +Nearest to the: its, their, a, this, adiabatic, callithrix, agouti, his, +Nearest to when: primigenius, after, austin, but, suppress, five, tamarin, and, +Nearest to its: their, his, the, callithrix, her, agouti, cebus, metis, +Nearest to use: ashoka, polyhedra, gland, rigorously, gettysburg, recommend, microcebus, expectation, +Nearest to he: it, she, they, who, there, callithrix, but, ssbn, +Nearest to six: eight, five, four, seven, nine, three, zero, callithrix, +Nearest to s: his, wct, and, dasyprocta, callithrix, zero, was, cinque, +Average loss at step 62000 : 4.80653275287 +Average loss at step 64000 : 4.80298928308 +Average loss at step 66000 : 4.9778061583 +Average loss at step 68000 : 4.92880989146 +Average loss at step 70000 : 4.78815213633 +Nearest to a: the, ssbn, upanija, wct, cebus, agouti, callithrix, expired, +Nearest to years: four, five, months, amsterdam, microcebus, six, hoax, reuptake, +Nearest to may: can, will, would, might, should, must, could, to, +Nearest to new: circles, naaman, goo, albury, seam, scientifically, monophosphate, yangon, +Nearest to all: many, tico, microcebus, some, polyhedra, callithrix, various, acts, +Nearest to used: known, found, handicap, decline, microcebus, dasyprocta, cebus, agouti, +Nearest to called: exchanging, ssbn, pi, champlain, acetylcholine, tom, ecc, iic, +Nearest to that: which, what, this, never, cebus, objective, dinar, tamarin, +Nearest to three: four, five, six, seven, two, eight, callithrix, nine, +Nearest to the: its, their, this, callithrix, agouti, a, adiabatic, wct, +Nearest to when: after, primigenius, austin, but, as, suppress, tamarin, with, +Nearest to its: their, his, the, callithrix, her, cebus, metis, agouti, +Nearest to use: polyhedra, ashoka, gland, unassigned, microcebus, callithrix, expectation, rigorously, +Nearest to he: it, she, they, who, there, callithrix, but, ssbn, +Nearest to six: eight, four, five, seven, three, nine, zero, callithrix, +Nearest to s: wct, his, dasyprocta, zero, thz, cinque, callithrix, or, +Average loss at step 72000 : 4.80325884366 +Average loss at step 74000 : 4.78602541548 +Average loss at step 76000 : 4.89614222682 +Average loss at step 78000 : 4.78169331312 +Average loss at step 80000 : 4.80316182685 +Nearest to a: the, ssbn, wct, upanija, cegep, thighs, dipyramid, agouti, +Nearest to years: months, four, microcebus, amsterdam, hoax, five, reuptake, happens, +Nearest to may: can, will, would, might, should, must, could, to, +Nearest to new: circles, naaman, goo, seam, albury, scientifically, prague, monophosphate, +Nearest to all: microcebus, many, two, tico, some, callithrix, polyhedra, these, +Nearest to used: known, found, handicap, decline, dasyprocta, microcebus, agouti, cebus, +Nearest to called: exchanging, ssbn, hood, champlain, protested, customization, pi, ecc, +Nearest to that: which, what, this, objective, however, cebus, naaman, tamarin, +Nearest to three: four, five, six, two, seven, eight, callithrix, one, +Nearest to the: its, callithrix, their, agouti, wct, his, microsite, this, +Nearest to when: after, clodius, austin, but, primigenius, tamarin, because, five, +Nearest to its: their, his, the, her, callithrix, metis, agouti, tumor, +Nearest to use: polyhedra, ashoka, gland, cegep, unassigned, accept, microcebus, callithrix, +Nearest to he: it, she, they, who, there, callithrix, professions, iit, +Nearest to six: five, four, eight, seven, three, nine, two, zero, +Nearest 
to s: wct, dasyprocta, zero, his, cinque, callithrix, masterpieces, thz, +Average loss at step 82000 : 4.80276642525 +Average loss at step 84000 : 4.78886307824 +Average loss at step 86000 : 4.73723094189 +Average loss at step 88000 : 4.69320081282 +Average loss at step 90000 : 4.75272560072 +Nearest to a: the, ssbn, wct, upanija, any, another, cegep, dipyramid, +Nearest to years: months, microcebus, four, amsterdam, hoax, five, reuptake, vannevar, +Nearest to may: can, will, would, might, should, must, could, cannot, +Nearest to new: circles, naaman, goo, one, albury, thaler, gb, seam, +Nearest to all: some, many, microcebus, tico, these, callithrix, various, polyhedra, +Nearest to used: known, found, microcebus, handicap, dasyprocta, seen, agouti, decline, +Nearest to called: exchanging, hood, ssbn, split, customization, thaler, protested, champlain, +Nearest to that: which, what, however, this, but, cebus, dinar, tamarin, +Nearest to three: four, two, five, seven, eight, six, callithrix, cegep, +Nearest to the: its, their, callithrix, agouti, wct, adiabatic, a, cegep, +Nearest to when: after, clodius, but, if, before, tamarin, because, primigenius, +Nearest to its: their, his, the, her, callithrix, agouti, metis, wct, +Nearest to use: polyhedra, ashoka, gland, cegep, callithrix, microcebus, clodius, catalysis, +Nearest to he: she, it, they, there, who, callithrix, iit, zero, +Nearest to six: seven, eight, five, four, nine, three, zero, callithrix, +Nearest to s: wct, dasyprocta, his, thz, chalcedon, cinque, zero, mating, +Average loss at step 92000 : 4.71442663682 +Average loss at step 94000 : 4.60804726839 +Average loss at step 96000 : 4.72456447947 +Average loss at step 98000 : 4.62782734013 +Average loss at step 100000 : 4.67684453142 +Nearest to a: the, ssbn, any, upanija, wct, cegep, another, expired, +Nearest to years: months, four, microcebus, amsterdam, days, hoax, reuptake, happens, +Nearest to may: can, will, would, might, should, could, must, cannot, +Nearest to new: circles, naaman, goo, seam, prague, albury, scientifically, thaler, +Nearest to all: many, some, microcebus, tico, these, various, callithrix, two, +Nearest to used: known, found, dasyprocta, cebus, microcebus, handicap, seen, agouti, +Nearest to called: exchanging, hood, customization, thaler, split, protested, ssbn, ecc, +Nearest to that: which, what, however, this, but, polyhedra, cebus, cegep, +Nearest to three: four, five, six, two, seven, eight, callithrix, nine, +Nearest to the: its, their, callithrix, nordisk, adiabatic, agouti, wct, his, +Nearest to when: after, if, clodius, before, but, because, tamarin, where, +Nearest to its: their, his, the, her, callithrix, agouti, wct, thz, +Nearest to use: polyhedra, ashoka, cegep, callithrix, microcebus, gland, catalysis, clodius, +Nearest to he: she, it, they, who, there, callithrix, iit, already, +Nearest to six: seven, eight, five, four, nine, three, two, callithrix, +Nearest to s: wct, his, dasyprocta, thz, the, was, chalcedon, callithrix, +/usr/lib64/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison + if self._edgecolors == str('face'): + diff --git a/hw11_natural_language_processing/martink_word2vec/text8.zip b/hw11_natural_language_processing/martink_word2vec/text8.zip new file mode 100644 index 0000000000000000000000000000000000000000..436e05b2dffe4f084fdada65374bd7804bdce097 Binary files /dev/null and 
b/hw11_natural_language_processing/martink_word2vec/text8.zip differ diff --git a/hw11_natural_language_processing/martink_word2vec/tsne.png b/hw11_natural_language_processing/martink_word2vec/tsne.png new file mode 100644 index 0000000000000000000000000000000000000000..a62f3035d1470d3dfa7cf4b116408e1e340bdbcc Binary files /dev/null and b/hw11_natural_language_processing/martink_word2vec/tsne.png differ diff --git a/hw11_natural_language_processing/martink_word2vec/word2vec_basic.py b/hw11_natural_language_processing/martink_word2vec/word2vec_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..c717693a567249ea00c96379e58ba4aeb5ed9f8d --- /dev/null +++ b/hw11_natural_language_processing/martink_word2vec/word2vec_basic.py @@ -0,0 +1,249 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random +import zipfile + +import numpy as np +from six.moves import urllib +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +# Step 1: Download the data. +url = 'http://mattmahoney.net/dc/' + + +def maybe_download(filename, expected_bytes): + """Download a file if not present, and make sure it's the right size.""" + if not os.path.exists(filename): + filename, _ = urllib.request.urlretrieve(url + filename, filename) + statinfo = os.stat(filename) + if statinfo.st_size == expected_bytes: + print('Found and verified', filename) + else: + print(statinfo.st_size) + raise Exception( + 'Failed to verify ' + filename + '. Can you get to it with a browser?') + return filename + +filename = maybe_download('text8.zip', 31344016) + + +# Read the data into a list of strings. +def read_data(filename): + """Extract the first file enclosed in a zip file as a list of words""" + with zipfile.ZipFile(filename) as f: + data = tf.compat.as_str(f.read(f.namelist()[0])).split() + return data + +words = read_data(filename) +print('Data size', len(words)) + +# Step 2: Build the dictionary and replace rare words with UNK token. +vocabulary_size = 50000 + + +def build_dataset(words): + count = [['UNK', -1]] + count.extend(collections.Counter(words).most_common(vocabulary_size - 1)) + dictionary = dict() + for word, _ in count: + dictionary[word] = len(dictionary) + data = list() + unk_count = 0 + for word in words: + if word in dictionary: + index = dictionary[word] + else: + index = 0 # dictionary['UNK'] + unk_count += 1 + data.append(index) + count[0][1] = unk_count + reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) + return data, count, dictionary, reverse_dictionary + +data, count, dictionary, reverse_dictionary = build_dataset(words) +del words # Hint to reduce memory. 
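+# build_dataset returns: data (the corpus as a list of word indices, 0 == 'UNK'),
+# count (the [word, frequency] pairs kept in the vocabulary), dictionary
+# (word -> index), and reverse_dictionary (index -> word).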
+print('Most common words (+UNK)', count[:5]) +print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) + +data_index = 0 + + +# Step 3: Function to generate a training batch for the skip-gram model. +def generate_batch(batch_size, num_skips, skip_window): + global data_index + assert batch_size % num_skips == 0 + assert num_skips <= 2 * skip_window + batch = np.ndarray(shape=(batch_size), dtype=np.int32) + labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) + span = 2 * skip_window + 1 # [ skip_window target skip_window ] + buffer = collections.deque(maxlen=span) + for _ in range(span): + buffer.append(data[data_index]) + data_index = (data_index + 1) % len(data) + for i in range(batch_size // num_skips): + target = skip_window # target label at the center of the buffer + targets_to_avoid = [skip_window] + for j in range(num_skips): + while target in targets_to_avoid: + target = random.randint(0, span - 1) + targets_to_avoid.append(target) + batch[i * num_skips + j] = buffer[skip_window] + labels[i * num_skips + j, 0] = buffer[target] + buffer.append(data[data_index]) + data_index = (data_index + 1) % len(data) + return batch, labels + +batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1) +for i in range(8): + print(batch[i], reverse_dictionary[batch[i]], + '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) + +# Step 4: Build and train a skip-gram model. + +batch_size = 128 +embedding_size = 128 # Dimension of the embedding vector. +skip_window = 1 # How many words to consider left and right. +num_skips = 2 # How many times to reuse an input to generate a label. + +# We pick a random validation set to sample nearest neighbors. Here we limit the +# validation samples to the words that have a low numeric ID, which by +# construction are also the most frequent. +valid_size = 16 # Random set of words to evaluate similarity on. +valid_window = 100 # Only pick dev samples in the head of the distribution. +valid_examples = np.random.choice(valid_window, valid_size, replace=False) +num_sampled = 64 # Number of negative examples to sample. + +graph = tf.Graph() + +with graph.as_default(): + + # Input data. + train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) + train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) + valid_dataset = tf.constant(valid_examples, dtype=tf.int32) + + # Ops and variables pinned to the CPU because of missing GPU implementation + with tf.device('/cpu:0'): + # Look up embeddings for inputs. + embeddings = tf.Variable( + tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) + embed = tf.nn.embedding_lookup(embeddings, train_inputs) + + # Construct the variables for the NCE loss + nce_weights = tf.Variable( + tf.truncated_normal([vocabulary_size, embedding_size], + stddev=1.0 / math.sqrt(embedding_size))) + nce_biases = tf.Variable(tf.zeros([vocabulary_size])) + + # Compute the average NCE loss for the batch. + # tf.nce_loss automatically draws a new sample of the negative labels each + # time we evaluate the loss. + loss = tf.reduce_mean( + tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels, + num_sampled, vocabulary_size)) + + # Construct the SGD optimizer using a learning rate of 1.0. + optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss) + + # Compute the cosine similarity between minibatch examples and all embeddings. 
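+  # Dividing each embedding row by its L2 norm means the matrix product below
+  # gives cosine similarities between the validation words and every other word.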
+ norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) + normalized_embeddings = embeddings / norm + valid_embeddings = tf.nn.embedding_lookup( + normalized_embeddings, valid_dataset) + similarity = tf.matmul( + valid_embeddings, normalized_embeddings, transpose_b=True) + + # Add variable initializer. + init = tf.initialize_all_variables() + +# Step 5: Begin training. +num_steps = 100001 + +with tf.Session(graph=graph) as session: + # We must initialize all variables before we use them. + init.run() + print("Initialized") + + average_loss = 0 + for step in xrange(num_steps): + batch_inputs, batch_labels = generate_batch( + batch_size, num_skips, skip_window) + feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels} + + # We perform one update step by evaluating the optimizer op (including it + # in the list of returned values for session.run() + _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict) + average_loss += loss_val + + if step % 2000 == 0: + if step > 0: + average_loss /= 2000 + # The average loss is an estimate of the loss over the last 2000 batches. + print("Average loss at step ", step, ": ", average_loss) + average_loss = 0 + + # Note that this is expensive (~20% slowdown if computed every 500 steps) + if step % 10000 == 0: + sim = similarity.eval() + for i in xrange(valid_size): + valid_word = reverse_dictionary[valid_examples[i]] + top_k = 8 # number of nearest neighbors + nearest = (-sim[i, :]).argsort()[1:top_k + 1] + log_str = "Nearest to %s:" % valid_word + for k in xrange(top_k): + close_word = reverse_dictionary[nearest[k]] + log_str = "%s %s," % (log_str, close_word) + print(log_str) + final_embeddings = normalized_embeddings.eval() + +# Step 6: Visualize the embeddings. + + +def plot_with_labels(low_dim_embs, labels, filename='tsne.png'): + assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings" + plt.figure(figsize=(18, 18)) # in inches + for i, label in enumerate(labels): + x, y = low_dim_embs[i, :] + plt.scatter(x, y) + plt.annotate(label, + xy=(x, y), + xytext=(5, 2), + textcoords='offset points', + ha='right', + va='bottom') + + plt.savefig(filename) + +try: + from sklearn.manifold import TSNE + import matplotlib.pyplot as plt + + tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000) + plot_only = 500 + low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :]) + labels = [reverse_dictionary[i] for i in xrange(plot_only)] + plot_with_labels(low_dim_embs, labels) + +except ImportError: + print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")
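+# The scatter plot of the 500 lowest-index (i.e. most frequent) words is written
+# to tsne.png, the image added alongside this script in this commit.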