参考源码来自
https://github.com/NELSONZHAO/zhihu/tree/master/skip_gram
导入包、文件以及文本清洗
import time
import numpy as np
import tensorflow as tf
import random
from collections import Counter
# Load the whole training corpus into memory as a single string.
with open('data.txt', 'r') as f:
    text = f.read()
def preprocess(text, freq=5):
    """Tokenise raw text and drop low-frequency words.

    The text is lower-cased, selected punctuation marks are replaced by
    placeholder tokens (for example ``.`` becomes ``<PERIOD>``) so that they
    survive whitespace splitting as separate words, and the result is split
    on whitespace.

    Parameters
    ----------
    text : str
        Raw text data.
    freq : int
        Word-frequency threshold. Only words that occur strictly more than
        ``freq`` times in ``text`` are kept.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Punctuation -> placeholder token. No token contains a character that
    # is itself replaced, so each symbol only needs to be handled once.
    replacements = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
    )

    # Lower-case first so the upper-case placeholder tokens stay intact.
    text = text.lower()
    for symbol, token in replacements:
        text = text.replace(symbol, token)
    words = text.split()

    # Remove low-frequency words to reduce the influence of noise.
    word_counts = Counter(words)
    return [word for word in words if word_counts[word] > freq]
# Clean and tokenise the corpus, then print the first 20 tokens as a quick
# sanity check.
words = preprocess(text)
print(words[0:20])
当前结果图
# Build the word <-> integer-id lookup tables.
vocab = set(words)
# Enumerate the vocabulary in sorted order: iterating a set of strings
# directly can yield a different order on every interpreter run (string
# hashing is randomised), which would make the ids irreproducible.
vocab_to_int = {w: c for c, w in enumerate(sorted(vocab))}
# Derive the inverse table from the forward one so the two always agree.
int_to_vocab = {c: w for w, c in vocab_to_int.items()}
print("total words: {}".format(len(words)))
print("unique words: {}".format(len(vocab)))
结果