版权声明:我是南七小僧,微信: to_my_love ,寻找人工智能相关工作,欢迎交流思想碰撞。 https://blog.csdn.net/qq_25439417/article/details/83747895
使用步骤:
1.实例化Tokenizer对象,给出最大词汇量nb_words(Keras 2起该参数已更名为num_words,继续使用nb_words仍然可以,但会出现弃用警告)
2.用tokenizer.fit_on_texts令牌化所有文章:把所有文章放进一个list,每篇文章是一个字符串,词或者字之间用空格分割(也可以包装成 list(list())的形式,即每篇文章是一个已经分好词的list)
3.tokenizer.word_index会输出所有词汇与index的映射--》也就是词表【切记Tokenizer默认lower=True,如果词汇中包含大写字母,会被转成小写;后面做初始化embedding的时候,如果预训练词向量的key是大写的,查表之前切记要把词转回大写。另外word_index包含全部词汇,不会按nb_words截断,index可能超过nb_words】
4.embedding matrix初始化的时候,要按照词表的index来排序,采用循环来做,这个时候 要用dictionary的get方法来取values,因为用【】下标的形式,如果词汇不在预训练词向量的字典中,会抛出KeyError。当然如果用try/except捕获也是可以的。
def get_train_test_data_embeddingweights():
    """Build padded question-pair inputs, labels and the embedding matrix.

    Reads ``question_id.csv`` (the first comma-separated field of each line
    is used as the question id, the second as its space-separated text),
    fits a Keras ``Tokenizer`` on the texts, pads the resulting index
    sequences to ``max_sequence_len`` and looks up every question id
    returned by ``get_input_and_label_list()``.

    Relies on module-level names defined elsewhere in the file:
    ``get_input_and_label_list``, ``Tokenizer``, ``pad_sequences``,
    ``voc_size``, ``max_sequence_len`` and ``all_embedding_dict``
    (presumably a ``dict`` mapping upper-case tokens to 300-d vectors --
    confirm against where it is loaded).

    Returns:
        A 4-tuple ``(x1, x2, y, embed_train_matrix)`` where ``x1`` / ``x2``
        are arrays of padded sequences for the first / second question of
        each pair, ``y`` is the label array, and ``embed_train_matrix`` is
        a ``(voc_size + 1, 300)`` array whose row ``i`` holds the
        pretrained vector of the word with tokenizer index ``i`` (zeros
        when no pretrained vector is available).

    Raises:
        KeyError: if a question id from ``get_input_and_label_list()`` is
            not present in ``question_id.csv``.
    """
    input_1_list, input_2_list, label_list = get_input_and_label_list()

    qlist = []
    qcontentlist = []
    with open('question_id.csv', 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line would otherwise raise IndexError on values[1].
            if not line.strip():
                continue
            values = line.split(',')
            qlist.append(values[0])
            qcontentlist.append(values[1])

    tokenizer = Tokenizer(nb_words=voc_size)
    tokenizer.fit_on_texts(qcontentlist)
    sequences = tokenizer.texts_to_sequences(qcontentlist)
    print(tokenizer.word_index)

    embed_train_matrix = np.zeros((voc_size + 1, 300))
    for word, index in tokenizer.word_index.items():
        # word_index holds every token seen, not just the voc_size most
        # frequent ones, so indices past the last row must be skipped;
        # texts_to_sequences never emits them anyway.
        if index > voc_size:
            continue
        # The tokenizer lower-cases tokens; the lookup upper-cases them
        # again before querying the pretrained embedding dictionary.
        embedding_vector = all_embedding_dict.get(word.upper())
        if embedding_vector is not None:
            embed_train_matrix[index] = embedding_vector

    data = pad_sequences(sequences, maxlen=max_sequence_len)
    # qlist and data are built from the same lines in the same order, so
    # row j of data belongs to qlist[j].
    token_dict = dict(zip(qlist, data))

    x1_train_list = []
    x2_train_list = []
    y_list = []
    for pair_index, first_qid in enumerate(input_1_list):
        x1_train_list.append(token_dict[first_qid])
        x2_train_list.append(token_dict[input_2_list[pair_index]])
        y_list.append(label_list[pair_index])

    # Shuffling and any train/validation split are left to the caller.
    return (
        np.array(x1_train_list),
        np.array(x2_train_list),
        np.array(y_list),
        embed_train_matrix,
    )
这里贴一些官方文档:
、