plda源码(五)
model_base.h
存储所有word的topic分布
// The ModelBase class stores topic-word co-occurrence count vectors as
// well as a vector of global topic occurrence counts. The global vector is
// the sum of the other vectors. These vectors are precisely the components of
// an LDA model (as document-topic associations are not true model parameters).
//
// This class supports common operations on this sort of model, primarily in
// the form of assigning new topic occurrences to words, and in reassigning
// word occurrences from one topic to another.
//
// This class is not thread-safe. Do not share an object of this
// class by multiple threads.
template <class T>
class ModelBase {
public:
// An iterator over a LDA model. Returns distributions in an arbitrary
// order. Altering the parent LDA model in any way invalidates the
// iterator, although this is not currently enforced.
class Iterator {//返回词的topic分布的迭代器
public:
// Initializes the iterator for the model specified by parent. parent must
// exist and must not be modified for the lifetime of the iterator.
explicit Iterator(const ModelBase<T>* parent)
: parent_(parent),
iterator_(0) { }
~Iterator() { }
// Advances to the next word.
void Next() {
CHECK(!Done());
++iterator_;
}
// Returns true if we have finished iterating over the model.
bool Done() const {
return iterator_ == parent_->topic_distributions_.size();
}
// Returns the current word.
int Word() const {
CHECK(!Done());
return iterator_;
}
// Returns the current word's distribution.
const TopicDistribution<T>& Distribution() const {
CHECK(!Done());
return parent_->GetWordTopicDistribution(iterator_);
}
private:
const ModelBase<T>* parent_;
int iterator_;
};
friend class Iterator;
ModelBase(int num_topics, const map<string, int>& word_index_map) {//知道topic数量和word_index_map, 初始分布为0
int vocab_size = word_index_map.size();
n_kw_vec.resize(vocab_size);
memory_alloc_.resize((int64)num_topics * ((int64)vocab_size + 1), 0);
// topic_distribution and global_distribution are just accessor pointers
// and are not responsible for allocating/deleting memory.
topic_distributions_.resize(vocab_size);
global_distribution_.Reset(
&memory_alloc_[0] + (int64)vocab_size * num_topics, num_topics);
for (int i = 0; i < vocab_size; ++i) {
topic_distributions_[i] =
TopicDistribution<T>(
&memory_alloc_[0] + (int64)num_topics * i, num_topics);
}
word_index_map_ = word_index_map;
}
// Read word topic distribution and global distribution from iframe.
// Return a map from word string to index. Intenally we use int to represent
// each word.
ModelBase(std::istream& in, map<string, int>* word_index_map) {
//从流中读取topic分布,自己构建word_index_map
word_index_map_.clear();
memory_alloc_.clear();
string line;
while (getline(in, line)) { // Each line is a training document.
if (line.size() > 0 && // Skip empty lines.
line[0] != '\r' && // Skip empty lines.
line[0] != '\n' && // Skip empty lines.
line[0] != '#') { // Skip comment lines.
std::istringstream ss(line);
string word;
double count_float;
CHECK(ss >> word);
while (ss >> count_float) {
memory_alloc_.push_back((T)count_float);
}
int size = word_index_map_.size();
word_index_map_[word] = size;
}
}
SetDistributionData();
*word_index_map = word_index_map_;
}
// Read word topic distribution and global distribution from iframe.
// Only words in the word_index_map are kept.
// Intenally we use int to represent each word.
ModelBase(std::istream& in, const map<string, int>& word_index_map) {//从流中读取topic分布,使用指定word_inde_map
memory_alloc_.clear();
word_index_map_ = word_index_map;
int num_topics = 0;
string line;
while (getline(in, line)) { // Each line is a training document.
if (line.size() > 0 && // Skip empty lines.
line[0] != '\r' && // Skip empty lines.
line[0] != '\n' && // Skip empty lines.
line[0] != '#') { // Skip comment lines.
std::istringstream ss(line);
string word;
double count_float;
CHECK(ss >> word);
if (memory_alloc_.empty()) {
vector<T> temp;
while (ss >> count_float) {
temp.push_back((T)count_float);
}
num_topics = temp.size();
memory_alloc_.resize(
num_topics * (word_index_map_.size()), (T)0);
if (word_index_map_.find(word) != word_index_map_.end()) {
int index = word_index_map_[word];
typename vector<T>::iterator cpit = memory_alloc_.begin();
//advance的英文含义是增加、向前推。顾名思义就是讲迭代器向前推n个位置
advance(cpit, index * num_topics);
copy(temp.begin(), temp.end(), cpit);
}
continue;
}
if (word_index_map_.find(word) == word_index_map_.end())
continue;
int index = word_index_map_[word] * num_topics;
while (ss >> count_float) {
memory_alloc_[index++] = (T)count_float;
}
}
}
SetDistributionData();
}
// Read word topic distribution and global distribution from iframe,extend topic to num_topics.
// Only words in the word_index_map are kept.
// Intenally we use int to represent each word.
ModelBase(std::istream& in, const map<string, int>& word_index_map, int num_topics) {
memory_alloc_.clear();
word_index_map_ = word_index_map;
memory_alloc_.resize(num_topics * (word_index_map_.size()),
(T) 0);
n_kw_vec.resize(word_index_map_.size());
string line;
while (getline(in, line)) { // Each line is a training document.
if (line.size() > 0 && // Skip empty lines.
line[0] != '\r' && // Skip empty lines.
line[0] != '\n' && // Skip empty lines.
line[0] != '#') { // Skip comment lines.
std::istringstream ss(line);
string word;
double count_float;
CHECK(ss >> word);
if (word_index_map_.find(word) == word_index_map_.end())
continue;
int word_id = word_index_map_[word];
int index = word_id * num_topics;
int topic_id = 0;
std::unordered_map<int, int> topic_map;
while (ss >> count_float) {
memory_alloc_[index++] = (T) count_float;
if((int)count_float >= 1) {
topic_map[topic_id] = (int)count_float;
}
topic_id++;
}
n_kw_vec[word_id] = topic_map;
}
}
SetDistributionData();
}
~ModelBase() {}
// Returns the topic distribution for word.
const TopicDistribution<T>& GetWordTopicDistribution(int word) const {
return topic_distributions_[word];
}
// Returns the global topic distribution.
const TopicDistribution<T>& GetGlobalTopicDistribution() const {
return global_distribution_;
}
// Appends more topics.
void AppendTopics(int num_new_topics) {//新增topic,num_new_topics > 0
int num_cur_topics = global_distribution_.size();
int num_topics = num_cur_topics + num_new_topics;
int vocab_size = word_index_map_.size();
memory_alloc_.resize((int64)num_topics * (vocab_size + 1), 0);
global_distribution_.Reset(
&memory_alloc_[0] + (int64)vocab_size * num_topics, num_topics);
for (int i = 0; i < vocab_size; ++i) {
topic_distributions_[i].Reset(
&memory_alloc_[0] + (int64)i * num_topics, num_topics);
}
typename vector<T>::iterator base_iter_src =
memory_alloc_.begin() + (int64)vocab_size * num_cur_topics;
typename vector<T>::iterator base_iter_dst =
memory_alloc_.begin() + (int64)vocab_size * num_topics;
fill_n(base_iter_dst + num_cur_topics, num_new_topics, 0);
copy_backward(base_iter_src, base_iter_src + num_cur_topics,
base_iter_dst + num_cur_topics);
for (int i = vocab_size - 1; i >= 0; --i) {
base_iter_src = memory_alloc_.begin() + (int64)i * num_cur_topics;
base_iter_dst = memory_alloc_.begin() + (int64)i * num_topics;
fill_n(base_iter_dst + num_cur_topics, num_new_topics, 0);
copy_backward(base_iter_src, base_iter_src + num_cur_topics,
base_iter_dst + num_cur_topics);
}
}
// Returns the number of topics in the model.
int num_topics() const { return global_distribution_.size(); }
// Returns the number of words in the model (not including the global word).
int vocab_size() const { return topic_distributions_.size(); }
// Returns the model memory.
const vector<T>& model_memory() const { return memory_alloc_; }
// Output topic_distributions_ into human readable format.
void AppendAsString(std::ostream& out) const {//输出topic分布
vector<string> index_word_map(word_index_map_.size());
for (map<string, int>::const_iterator iter = word_index_map_.begin();
iter != word_index_map_.end(); ++iter) {
index_word_map[iter->second] = iter->first;
}
for (Iterator iter(this); !iter.Done(); iter.Next()) {
out << index_word_map[iter.Word()] << "\t";
for (int topic = 0; topic < num_topics(); ++topic) {
out << iter.Distribution()[topic]
<< ((topic < num_topics() - 1) ? " " : "\n");
}
}
}
protected:
// The dataset which keep all the model memory.
vector<T> memory_alloc_;
// topic_distributions_["word"][k] counts the number of times that
// word "word" and assigned topic k by a Gibbs sampling iteration.
vector<TopicDistribution<T> > topic_distributions_;
// global_distribution_[k] is the number of words in the training
// corpus that are assigned by topic k.
TopicDistribution<T> global_distribution_;
std::vector<std::unordered_map<int, int> > n_kw_vec;
private:
void SetDistributionData() {//设置global_distribution_
int vocab_size = word_index_map_.size();
int num_topics = memory_alloc_.size() / vocab_size;
CheckNumTopics(num_topics, true, true);
memory_alloc_.resize((int64)num_topics * ((int64)vocab_size + 1), 0);
// topic_distribution and global_distribution are just accessor pointers
// and are not responsible for allocating/deleting memory.
topic_distributions_.resize(vocab_size);
global_distribution_.Reset(
&memory_alloc_[0] + (int64)vocab_size * num_topics, num_topics);
for (int i = 0; i < vocab_size; ++i) {
topic_distributions_[i] =
TopicDistribution<T>(&memory_alloc_[0] + num_topics * i, num_topics);
}
for (int i = 0; i < vocab_size; ++i) {
for (int j = 0; j < num_topics; ++j) {
global_distribution_[j] += topic_distributions_[i][j];
}
}
}
map<string, int> word_index_map_;
};