These are my study notes and a summary; if anything is wrong, corrections are very welcome.
Discrete representations: one-hot encoding, bag of words (with TF-IDF weighting), N-grams.
Problems: they cannot express relationships between word vectors, the vocabulary dimension keeps growing with the corpus, the data are sparse, and the usual operations (distance metrics, logical AND/OR/NOT) are not meaningful on them (see the short sketch below).
Distributed representations (a word is represented by the other words that appear around it): word2vec, word embeddings.
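A minimal sketch of the first problem, with an illustrative three-word vocabulary (the words and dimensions are made up for the example, not from the notes): every pair of distinct one-hot vectors has cosine similarity 0 and the same Euclidean distance, so the representation carries no information about how related two words are:
import numpy as np
# toy vocabulary: each word is a one-hot vector over 3 dimensions
cat   = np.array([1., 0., 0.])
dog   = np.array([0., 1., 0.])
paris = np.array([0., 0., 1.])
def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
# every distinct pair looks equally unrelated: similarity 0, distance sqrt(2)
print(cosine(cat, dog), cosine(cat, paris))                    # 0.0 0.0
print(np.linalg.norm(cat - dog), np.linalg.norm(cat - paris))  # 1.414... 1.414...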
skip-gram:
Model characteristics: no hidden layer (the projection layer can also be omitted); each word vector is fed directly into a log-linear model.
Objective function (maximize the average log-probability of the context words around each center word):
$\frac{1}{T}\sum_{t=1}^{T}\sum_{-C \le j \le C,\ j \ne 0}\log P(w_{t+j}\mid w_t)$
The probability is given by a softmax over the vocabulary:
$P(w_O \mid w_I)=\frac{\exp(u_{w_O}^{\top} v_{w_I})}{\sum_{w=1}^{V}\exp(u_w^{\top} v_{w_I})}$
Loss function: the negative of the objective above; the softmax denominator runs over all $V$ words, which is what makes training expensive.
Negative sampling: for $P(w\mid context(w))$ there is one positive sample versus $V-1$ negative ones, so instead of normalizing over the whole vocabulary we sample a few negatives and maximize
$\log\sigma(u_{w_O}^{\top} v_{w_I})+\sum_{k=1}^{K}\mathbb{E}_{w_k\sim P_n(w)}\big[\log\sigma(-u_{w_k}^{\top} v_{w_I})\big]$
Core PyTorch implementation:
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

K = 100 # number of negative samples
C = 3 # nearby words threshold
NUM_EPOCHS = 2 # The number of epochs of training
MAX_VOCAB_SIZE = 30000 # the vocabulary size
BATCH_SIZE = 128 # the batch size
LEARNING_RATE = 0.2 # the initial learning rate
EMBEDDING_SIZE = 100
LOG_FILE = "word-embedding.log"
# tokenize function: split a piece of text into individual words
def word_tokenize(text):
    return text.split()
with open("./text8/text8.train.txt", "r") as fin:
    text = fin.read()
text = [w for w in word_tokenize(text.lower())]
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE-1)) # minus one keeps a slot for the unknown token
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word:i for i, word in enumerate(idx_to_word)}
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.) # raise frequencies to the 3/4 power, as in the paper; this improves accuracy
word_freqs = word_freqs / np.sum(word_freqs) # renormalize; this distribution is used for negative sampling
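# Intuition for the 3/4 power (illustrative numbers, not from the notes): with raw frequencies
# 0.9 and 0.1, the powered values are 0.9**0.75 ≈ 0.924 and 0.1**0.75 ≈ 0.178; after renormalizing,
# the rare word's sampling probability rises from 0.10 to about 0.16, so frequent words are drawn
# somewhat less often as negative samples.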
VOCAB_SIZE = len(idx_to_word)
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        ''' text: a list of words, all text from the training dataset
            word_to_idx: the dictionary from word to idx
            idx_to_word: idx to word mapping
            word_freqs: the (re-weighted) frequency of each word, used for negative sampling
            word_counts: the word counts
        '''
        super(WordEmbeddingDataset, self).__init__()
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE-1) for t in text]
        self.text_encoded = torch.Tensor(self.text_encoded).long()
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
    def __len__(self):
        ''' return the length of the whole dataset (the number of tokens)
        '''
        return len(self.text_encoded)
    def __getitem__(self, idx):
        ''' this function returns the following data for training:
            - the center word
            - the (positive) words near the center word
            - K randomly sampled words per positive word as negative samples
        '''
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx-C, idx)) + list(range(idx+1, idx+C+1)) # indices of the words inside the window
        pos_indices = [i % len(self.text_encoded) for i in pos_indices] # wrap around so we never index past the end of the text
        pos_words = self.text_encoded[pos_indices] # the surrounding (context) words
        # negative sampling: draw from the 3/4-powered unigram distribution, with replacement;
        # note that this simple sampler does not exclude the center/context words from the draws
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)
        return center_word, pos_words, neg_words
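# A quick usage check (a sketch, not part of the original notes): build the dataset and a
# DataLoader and inspect one batch's shapes.
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# each batch: input_labels [BATCH_SIZE], pos_labels [BATCH_SIZE, 2*C], neg_labels [BATCH_SIZE, 2*C*K]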
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        ''' initialize the input and output embeddings
        '''
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        initrange = 0.5 / self.embed_size
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed.weight.data.uniform_(-initrange, initrange)
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.in_embed.weight.data.uniform_(-initrange, initrange)
    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: center words, [batch_size]
        pos_labels: words that appear in the context window around the center word, [batch_size, (window_size * 2)]
        neg_labels: words that do not appear around the center word, obtained by negative sampling, [batch_size, (window_size * 2 * K)]
        return: loss, [batch_size]
        '''
        batch_size = input_labels.size(0)
        input_embedding = self.in_embed(input_labels) # B * embed_size
        pos_embedding = self.out_embed(pos_labels) # B * (2*C) * embed_size
        neg_embedding = self.out_embed(neg_labels) # B * (2*C*K) * embed_size
        # dot products between the center word and each context / negative word
        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze(2) # B * (2*C)
        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze(2) # B * (2*C*K)
        # the two terms of the negative-sampling objective
        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1) # batch_size
        loss = log_pos + log_neg
        return -loss

    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()
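The notes stop before the training loop, so here is a minimal sketch of how the pieces above could be wired together; the SGD optimizer, the averaging with loss.mean(), and the logging interval are my assumptions rather than part of the original, and dataloader is the DataLoader built after the dataset class:

model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
for epoch in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        optimizer.zero_grad()
        # forward() returns one loss value per example; average into a scalar before backprop
        loss = model(input_labels.long(), pos_labels.long(), neg_labels.long()).mean()
        loss.backward()
        optimizer.step()
        if i % 1000 == 0:
            print("epoch", epoch, "iteration", i, "loss", loss.item())
embedding_weights = model.input_embeddings() # the learned input vectors, VOCAB_SIZE * EMBEDDING_SIZE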
Evaluating the quality of the word embeddings:
- word analogy tasks
- word similarity tasks (see the short sketch below)
- as features for CRF-based named entity recognition
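For the similarity check, a minimal sketch using cosine similarity over the learned embedding matrix; the helper name find_nearest and the query word are illustrative choices, not from the notes:

def find_nearest(word, embedding_weights, topn=10):
    # rank every word in the vocabulary by cosine similarity to the query word's vector
    vec = embedding_weights[word_to_idx[word]]
    norms = np.linalg.norm(embedding_weights, axis=1) * np.linalg.norm(vec)
    sims = embedding_weights @ vec / np.maximum(norms, 1e-8)
    return [idx_to_word[i] for i in sims.argsort()[::-1][:topn]]

print(find_nearest("two", embedding_weights))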