Author-Topic Model

Author-Topic Model (ATModel)

The output produced in a recent blog post on Java is used directly as the input for the Python code below.
Bugs encountered:

1. Bug 1

perwordbound = at_model.bound(at_model.corpus, author2doc=at_model.author2doc,
                              doc2author=at_model.doc2author) / corpus_words

ValueError: bound cannot be called with authors not seen during training.

Cause: author.txt contains malformed records such as

湖北大学,2,5,6,8
湖北大学
,9,28
湖北中医药大学
,7,9,56

i.e. records that have been split across several lines, so the name-only line and the comma-leading line are each parsed as separate, bogus author entries. One way to repair them is sketched below.
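A minimal pre-cleaning sketch (not from the original post; clean_author_file and the paths are made up for illustration): any line that starts with a comma is merged back onto the previous record, so every line again has the form name,id1,id2,....

# Hypothetical helper: merge continuation lines in author.txt back onto the
# record they belong to, so each line reads "name,id1,id2,...".
def clean_author_file(src_path, dst_path):
    records = []
    with open(src_path, errors='ignore', encoding='utf-8') as fid:
        for line in fid:
            line = line.strip()
            if not line:
                continue  # drop blank lines
            if line.startswith(',') and records:
                records[-1] += line  # continuation of the previous record
            else:
                records.append(line)
    with open(dst_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(records) + '\n')

# clean_author_file('/AtmodelEnd/author.txt', '/AtmodelEnd/author_clean.txt')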

2. Bug 2

for a, a_doc_ids in author2doc.items():
    for i, doc_id in enumerate(a_doc_ids):
        author2doc[a][i] = doc_id_dict[doc_id]

The line author2doc[a][i] = doc_id_dict[doc_id] fails with KeyError: ''.
Cause: some lines in author.txt end with a trailing ",", so splitting produces an empty document ID, as in the first line below:

中南大学,11,12,
北京大学,15
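A defensive fix in the loading loop (a sketch; the "if tok" filter is the only change from the code below): drop empty tokens after splitting, so a trailing comma no longer yields the empty-string key.

ids = [tok for tok in contents.split(',')[1:] if tok]  # skip '' from a trailing comma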

The code below has the following issues:

  1. It contains some redundant parts.
  2. The exported sim.csv is garbled when opened in Excel but displays correctly in Notepad++; the encoding has to be changed to UTF-8-BOM in Notepad++ (writing with encoding='utf_8_sig', as the final loop does, produces that BOM directly; see also the sketch after the export loop).
  3. The functionality is not complete.
  4. It is not memory-friendly, and it is unclear how to adjust the current file format (the sketch after the export loop addresses part of this).
# -*- coding:utf-8 -*-
import os
import re

import pandas as pd
from bokeh.models import HoverTool
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from gensim import matutils
from gensim.corpora import Dictionary
from gensim.models import AuthorTopicModel
from gensim.models import atmodel
from sklearn.manifold import TSNE

# Data preparation: read the documents and build the author-document mapping
data_dir = '/AtmodelEnd/testno/'
docs = []
doc_ids = []
files = os.listdir(data_dir)

for filen in files:
    # Extract the numeric document ID from the file name.
    (id1, id2) = re.search('[0-9]+', filen).span()
    doc_ids.append(str(int(filen[id1:id2])))

    with open(data_dir + '/' + filen, errors='ignore', encoding='utf-8') as fid:
        txt = fid.read()
        txt = txt.split()
        docs.append(txt)

# Build author2doc (author name -> list of document IDs)
author2doc = dict()
filename = '/AtmodelEnd/author.txt'

for line in open(filename, errors='ignore', encoding='utf-8'):
    contents = re.sub(r'\s', '', line)  # strip all whitespace
    author_name = contents.split(',', 1)[0].strip()
    ids = contents.split(',')[1:]
    if author_name not in author2doc:
        author2doc[author_name] = []
    # Skip empty IDs produced by trailing commas (see Bug 2 above).
    author2doc[author_name].extend([id for id in ids if id])

# Replace the string document IDs in author2doc with integer corpus indices
doc_id_dict = dict(zip(doc_ids, range(len(doc_ids))))

for a, a_doc_ids in author2doc.items():
    for i, doc_id in enumerate(a_doc_ids):
        author2doc[a][i] = doc_id_dict[doc_id]
        # print(author2doc[a][i])

# Build the model corpus
dictionary = Dictionary(docs)
max_freq = 0.5
min_wordcount = 2
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
# print('Number of authors: %d' % len(author2doc))
# print('Number of unique tokens: %d' % len(dictionary))
# print('Number of documents: %d' % len(corpus))

# Model training and selection
model_list = []
for i in range(5):
    at_model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, author2doc=author2doc,
                                chunksize=100, passes=100, gamma_threshold=1e-10, eval_every=0, iterations=1,
                                random_state=i)
    top_topics = at_model.top_topics(corpus)
    tc = sum([t[1] for t in top_topics])
    model_list.append((at_model, tc))
# Model selection: keep the model with the highest topic coherence
at_model, tc = max(model_list, key=lambda x: x[1])
print('Topic coherence: %.3e' % tc)

# Evaluation metrics: per-word bound and topic coherence
doc2author = atmodel.construct_doc2author(at_model.corpus, at_model.author2doc)
# Compute the per-word bound.
# Number of words in corpus.
corpus_words = sum(cnt for document in at_model.corpus for _, cnt in document)  # total token count, including repeats
# Compute bound and divide by number of words.
perwordbound = at_model.bound(at_model.corpus, author2doc=at_model.author2doc,
                              doc2author=doc2author) / corpus_words
print(perwordbound)
# Topic coherence computation (the per-word bound above printed -6.815582939265057 in one run)
top_topics = at_model.top_topics(at_model.corpus)

# Using the model: list the words under each topic with their probabilities
for topic in at_model.show_topics(num_topics=10):
    print(topic)


# Project the per-author topic distributions to 2-D with t-SNE.
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 1  # Ignore authors with fewer documents than this.
authors = [at_model.author2id[a] for a in at_model.author2id.keys() if len(at_model.author2doc[a]) >= smallest_author]
_ = tsne.fit_transform(at_model.state.gamma[authors, :])  # Result stored in tsne.embedding_

# Interactive Bokeh scatter plot of the authors.
output_file("at_model.html")
x = tsne.embedding_[:, 0]
y = tsne.embedding_[:, 1]
author_names = [at_model.id2author[a] for a in authors]
scale = 0.1
author_sizes = [len(at_model.author2doc[a]) for a in author_names]
radii = [size * scale for size in author_sizes]

source = ColumnDataSource(
    data=dict(
        x=x,
        y=y,
        author_names=author_names,
        author_sizes=author_sizes,
        radii=radii,
    )
)

# Add author names and sizes to mouse-over info.
hover = HoverTool(
    tooltips=[
        ("author", "@author_names"),
        ("size", "@author_sizes"),
    ]
)

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)


# def show_author(name):
#     """
#     For each author: which are their main documents, which topics do they
#     load on, and with what relevance?
#     :param name: author name, e.g. 上海交通大学
#     :return: e.g. Docs: [73, 84, 95]
#              Topics: [(5, 0.9843652032245113)]
#     """
#     dic = {
#         'author': name,
#         'docs': at_model.author2doc[name],
#         'topics': at_model[name]
#     }
#     return dic
#
#
# for n in range(len(author2doc)):
#     authortopics = show_author(author_names[n])
#     # A dict has no to_csv; wrap it in a DataFrame and quote the path:
#     pd.DataFrame([authortopics]).to_csv('D:/Test/author_topics.csv', encoding='utf-8', mode='a')

"""
def show_author(name):

    所有作者的主要文章有哪些,主题有哪些,和对应主题相关度?
    :param name:
    :return:上海交通大学
             Docs: [73, 84, 95]
             Topics:
             [(5, 0.9843652032245113)]

    print('\n%s' % name)
    print('Docs:', at_model.author2doc[name])
    print('Topics:')
    print(at_model[name])


for n in range(len(author2doc)):
    show_author(author_names[n])
"""

# Exploring author similarity; the distance used is the Hellinger distance.
# Collect every author's topic distribution.
author_vecs = [at_model.get_author_topics(author) for author in at_model.id2author.values()]


def similarity(vec1, vec2):
    """
    :param vec1: sparse topic distribution of one author
    :param vec2: sparse topic distribution of another author
    :return: the similarity between the two vectors
    """
    dist = matutils.hellinger(matutils.sparse2full(vec1, at_model.num_topics),
                              matutils.sparse2full(vec2, at_model.num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim
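
# Sanity check (a sketch, not in the original post): an author's topic
# distribution compared with itself has Hellinger distance 0, so
# similarity() returns 1.0.
# print(similarity(author_vecs[0], author_vecs[0]))  # -> 1.0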


def get_sims(vec):
    """
    Get the similarity of a vector to all authors.
    :param vec: sparse topic distribution of one author
    :return: list of similarities to every author
    """
    sims = [similarity(vec, vec2) for vec2 in author_vecs]
    return sims


def get_table(name, top_n=10, smallest_author=2):
    """
    Get a table with similarities, author names, and author sizes.
    :param name: author to compare against
    :param top_n: number of rows to return
    :param smallest_author: minimum document count (the default 2 could be
        replaced by the smallest_author variable defined earlier)
    :return: the `top_n` most similar authors as a DataFrame
    """
    # Get similarities.
    sims = get_sims(at_model.get_author_topics(name))

    table = []
    for author_id, sim in enumerate(sims):
        authorname = at_model.id2author[author_id]
        author_size = len(at_model.author2doc[authorname])
        if author_size >= smallest_author:
            table.append((authorname, sim, author_size))

    # Make a dataframe and retrieve the top authors.
    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    df = df.sort_values('Score', ascending=False)[:top_n]
    return df


for j in range(len(author2doc)):
    sims = get_table(author_names[j])
    # 'utf_8_sig' writes a UTF-8 BOM so Excel detects the encoding
    # (the fix for issue 2 in the list above).
    sims.to_csv('D:/Test/sim.csv', encoding='utf_8_sig', mode='a')
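A possible follow-up for issues 2 and 4 in the list above (a sketch, not part of the original code; sim_all.csv is a made-up file name): building all the per-author tables and writing the file once avoids the header row being repeated on every append as well as the repeated file opens. pd.concat accepts a generator, so the tables are produced one at a time rather than kept in a separate list.

all_sims = pd.concat((get_table(name) for name in author_names),
                     ignore_index=True)
all_sims.to_csv('D:/Test/sim_all.csv', encoding='utf_8_sig', index=False)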

Reprinted from blog.csdn.net/qq_32482091/article/details/80876776