

#在开头加上from __future__ import print_function这句之后,即使在python2.X,使用print就得像python3.X那样加括号使用。
from __future__ import print_function   
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
print('check:libs well prepared')


# Base URL that hosts the text8 corpus archive.
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
    """Download `filename` from `url` if it is missing locally, then check its size.

    Args:
        filename: Name of the file, used both as the remote name appended to
            `url` and as the local path.
        expected_bytes: Size in bytes the local file is expected to have.

    Returns:
        The local path of the file. The path is returned whether or not the
        size check passed; a mismatch is only reported on stdout.
    """
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        # Report the actual size so that a wrong expectation or a truncated
        # download is visible instead of passing silently.
        print('exception %s' % statinfo.st_size)
    return filename

# The recorded run reports the archive as 31344016 bytes, so that is the size
# to verify against (the previous value 31244016 could never match).
filename = maybe_download('text8.zip', 31344016)

# Output after running: exception 31344016

def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Args:
        filename: Path to the zip archive.

    Returns:
        A list of strings obtained by splitting the archive member's text on
        whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_text = archive.read(first_member)
    # tf.compat.as_str turns the raw bytes into a native string before splitting.
    return tf.compat.as_str(raw_text).split()

# Load the corpus as a flat list of word tokens and report how many there are.
words = read_data(filename)
print('Data size %d' % len(words))

# Output after running: Data size 17005207

# Number of distinct tokens kept in the vocabulary, including the 'UNK' entry.
vocabulary_size = 50000


def build_dataset(words, vocab_size=None):
    """Map a token sequence onto integer ids, keeping only the most frequent tokens.

    Args:
        words: Sequence of string tokens.
        vocab_size: Total vocabulary size including the 'UNK' entry. Defaults
            to the module-level `vocabulary_size`.

    Returns:
        A tuple `(data, count, dictionary, reverse_dictionary)` where
        `data` is a list with one id per token of `words` (tokens outside the
        vocabulary map to 0), `count` is `[['UNK', n_unknown], (token, freq), ...]`
        in descending frequency, `dictionary` maps token -> id and
        `reverse_dictionary` maps id -> token.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))
    # Ids follow the order of `count`, so 'UNK' gets id 0 and the most
    # frequent token gets id 1.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    # Replace the -1 placeholder with the real number of out-of-vocabulary tokens.
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

# Encode the corpus as ids and print a small sample of the result.
data, count, dictionary, reverse_dictionary = build_dataset(words)

print('Most common words (+UNK)', count[:5])
print('original data', words[:10])
print('training data', data[:10])

# Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
# original data ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
# training data [5236, 3082, 12, 6, 195, 2, 3137, 46, 59, 156]


def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level `data` list.

    Reads tokens from `data` starting at the module-level `data_index`,
    advancing (and wrapping) that index as it goes, so successive calls walk
    through the corpus.

    Args:
        batch_size: Number of (input, label) pairs to produce; must be a
            multiple of `num_skips`.
        num_skips: Number of context words drawn for each centre word; at
            most `2 * skip_window`.
        skip_window: Number of words considered on each side of the centre.

    Returns:
        A tuple `(batch, labels)`: `batch` is an int32 array of shape
        `(batch_size,)` holding centre-word ids, `labels` is an int32 array
        of shape `(batch_size, 1)` holding the paired context-word ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Sliding window laid out as [ skip_window  centre  skip_window ].
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    # Fill the window with the next `span` tokens, wrapping around the corpus.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # the centre position is never used as a label
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            # Remember the chosen position so each context word is used once.
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one token to the right (maxlen drops the oldest).
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
# Show the first few corpus tokens next to a small example batch built from them.
print('data:', [reverse_dictionary[di] for di in data[:8]])
# generate_batch reads from (and advances) this module-level cursor into `data`.
data_index = 0
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=2)
print('batch:', [reverse_dictionary[bi] for bi in batch])
print('labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

# data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']
# batch: ['as', 'as', 'a', 'a', 'term', 'term', 'of', 'of']
# labels: ['term', 'anarchism', 'as', 'term', 'abuse', 'of', 'abuse', 'term']


# Training hyper-parameters.
batch_size = 128      # Number of (input, label) pairs per training batch.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

# Validation words are drawn at random from the lowest ids only; ids are
# assigned in frequency order, so these are also the most frequent words.
valid_size = 16       # Random set of words to evaluate similarity on.
valid_window = 100    # Only pick dev samples in the head of the distribution.
valid_example = np.array(random.sample(range(valid_window), valid_size))

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Input for one batch: dictionary ids of the centre words.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    # Labels for one batch: dictionary ids of a neighbouring (context) word
    # for each centre word.
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_example, tf.int32)

    # Trainable parameters: the embedding table and the softmax output layer.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weight = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_bisase = tf.Variable(tf.zeros([vocabulary_size]))

    # Look up the embedding of every centre word in the batch.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    # Average sampled-softmax loss over the batch. A fresh set of
    # `num_sampled` negative classes is drawn each time the loss is evaluated.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weight,
                                   biases=softmax_bisase,
                                   inputs=embed,
                                   labels=train_labels,
                                   num_sampled=num_sampled,
                                   num_classes=vocabulary_size))

    # Adagrad tolerates a learning rate of 1.0; Adam with the same step size
    # made the loss grow instead of shrink.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # L2-normalise every embedding row (reduce_sum over axis 1 sums across
    # each row; keepdims keeps the result broadcastable against `embeddings`).
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    # Dot products of unit vectors: cosine similarity between each validation
    # word and every word in the vocabulary.
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))


num_steps = 100000
with tf.Session(graph=graph) as session:
    # Variables must be initialised before the first training step can run.
    tf.global_variables_initializer().run()
    average_loss = 0
    for step in range(num_steps+1):
        batch_data, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            # At step 0 only a single loss value has been accumulated, so it
            # is reported as-is; afterwards it is the mean of the last 2000.
            if step > 0:
                average_loss = average_loss / 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        if step % 10000 == 0:
            # Periodically list the nearest neighbours of the validation words.
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_example[i]]
                top_k = 5
                # Index 0 of the sorted order is skipped: it is the word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    # Evaluate the normalised embeddings once, after training has finished.
    final_embeddings = normalized_embeddings.eval()


# Average loss at step 0: 8.377776
# Nearest to new: reformer, inventor, interrupts, cranmer, shih,
# Nearest to may: misandry, gaius, supplement, inappropriate, kaposi,
# Nearest to he: glycogen, iic, cuzco, cranmer, deeds,
# Nearest to be: overload, liquidity, litt, lux, sirens,
# Nearest to in: norman, still, wa, arctocephalus, nitrogenous,
# Nearest to b: separable, jurisprudence, flo, endogamous, aviator,
# Nearest to eight: appropriating, curl, blogging, comecon, vikernes,
# Nearest to who: topper, sena, disprove, capitoline, netsplit,
# Nearest to known: prix, attributes, accelerates, excused, part,
# Nearest to but: basayev, valens, landon, dojos, masking,
# Nearest to also: sword, suffolk, nl, augmenting, ipcc,
# Nearest to six: transitory, gras, championing, misuse, acoustical,
# Nearest to will: potsdam, fangio, roskilde, obtainable, surpassed,
# Nearest to many: paradiso, polysaccharides, adolphus, krone, framing,
# Nearest to into: bruun, interim, dns, attractors, hangings,
# Nearest to to: dimers, womanizer, unemployment, hoo, necessitate,
# Average loss at step 2000: 5653.813801
# Average loss at step 4000: 10679.583946
# Average loss at step 6000: 12639.168628
# Average loss at step 8000: 13891.868634
# Average loss at step 10000: 14616.583857
# Nearest to new: financial, write, phospholipids, fta, narbonne,
# Nearest to may: phallic, supervises, catches, paw, specification,
# Nearest to he: capitalised, macao, exhibited, mathrm, commemorates,
# Nearest to be: feininger, lyrics, lps, jenny, fermentation,
# Nearest to in: powder, c, daniel, compounds, duplicating,
# Nearest to b: script, igbo, grace, collection, marlowe,
# Nearest to eight: edmonton, jennie, de, nims, lucretia,
# Nearest to who: writeup, vocally, fuller, banach, responded,
# Nearest to known: actions, observances, pistols, wherein, wedding,
# Nearest to but: drummer, dime, passion, glowing, miniature,
# Nearest to also: mechanism, replicating, domesticated, euston, vindication,
# Nearest to six: physik, parted, election, predominantly, ca,
# Nearest to will: worsen, derives, allophones, pseudopods, jermaine,
# Nearest to many: individualists, elders, living, montenegro, location,
# Nearest to into: uses, lay, albums, astronaut, approximated,
# Nearest to to: denounced, exhibited, boomerangs, topics, wept,
# Average loss at step 12000: 15368.244784


# Number of embeddings to project and plot.
num_points = 400
# init: string, optional (default 'random'); how the embedding is initialised.
# Options are 'random' and 'pca'. PCA initialisation cannot be used with
# precomputed distances and is usually more globally stable than random.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
# Rows 1..num_points are projected; row 0 (the 'UNK' entry) is skipped.
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])
def plot(embeddings, labels):
    """Draw a labelled scatter plot of 2-D points and display it.

    Args:
        embeddings: Array whose row `i` holds the (x, y) position of
            `labels[i]`; it must have at least `len(labels)` rows.
        labels: Sequence of text labels, one per plotted point.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15, 15))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        pylab.scatter(x, y)
        # Place the text slightly offset from the point it annotates.
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()

# Labels for the plotted points: the tokens with ids 1..num_points.
# Note that this rebinds the module-level name `words`.
words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)


