Google原生输入法LatinIME词库构建流程分析(二)

在《Google原生输入法LatinIME词库构建流程分析(一)》中，对LatinIME词库构建流程的分析进行到了dict_trie->dict_list_->init_list这一步，之后就是构建N-gram信息，这一过程已经在《Google原生输入法LatinIME词库构建流程分析(三)--N-gram信息构建》中进行了分析。本文接着往下分析：

// Excerpt of DictBuilder::build_dict(); the "..." lines mark code that is
// omitted from this listing. The visible part builds the unigram data from
// lemma_arr_, sorts lemma_arr_ with compare_py, calls get_top_lemmas(), and
// then calls construct_subset() starting at lma_nodes_le0_. If
// construct_subset() reports failure, resources are released and the
// function returns false.
bool DictBuilder::build_dict(const char *fn_raw,
                             const char *fn_validhzs,
                             DictTrie *dict_trie) {  
...
// Construct the NGram information
  NGram& ngram = NGram::get_instance();
  // The third argument is the last entry's idx_by_hz plus one.
  // NOTE(review): presumably the total number of distinct lemma ids --
  // confirm against NGram::build_unigram.
  ngram.build_unigram(lemma_arr_, lemma_num_,
                      lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
    
  // Sort by spl_idx_arr; entries whose ids are equal are ordered by the freq
  // field (see compare_py).
  // sort the lemma items according to the spelling idx string
  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);

  get_top_lemmas();

  // Statistics are only initialized when ___DO_STATISTICS___ is defined.
#ifdef ___DO_STATISTICS___
  stat_init();
#endif

  lma_nds_used_num_le0_ = 1;  // The root node
  // Start from offset 0 over all lemma_num_ entries; the trailing 0 is
  // presumably the starting level -- confirm against construct_subset.
  bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
                                     lemma_arr_, 0, lemma_num_, 0);
  if (!dt_success) {
    // Construction failed: release what was allocated and report failure.
    free_resource();
    return false;
  }
...
}

myqsort这一句对lemma_arr_数组进行排序，排序规则为：先按照spl_idx_arr进行比较，如果相等，再按照freq字段来排序。接下来调用get_top_lemmas()来初始化top_lmas_数组，数组长度为10，这里的top是指按照freq字段从大到小排在前10位的元素：

{{idx_by_py = 0, idx_by_hz = 8505, hanzi_str = {30340, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {8508, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {91, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "DE\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 4828294.5}, {idx_by_py = 0, idx_by_hz = 114, hanzi_str = {20102, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {114, 0, 0, 0, 0, 0, 0, 0}, 
    spl_idx_arr = {200, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"LE\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 1500186}, {idx_by_py = 0, idx_by_hz = 4196, hanzi_str = {25105, 0, 0, 0, 0, 0, 0, 
      0, 0}, hanzi_scis_ids = {4198, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {375, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"WO\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 1192789.25}, {
    idx_by_py = 0, idx_by_hz = 5084, hanzi_str = {26159, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {5087, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {338, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "ShI\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 1180957}, {idx_by_py = 0, idx_by_hz = 1955, hanzi_str = {22312, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {1957, 0, 0, 0, 0, 0, 0, 0}, 
    spl_idx_arr = {407, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"ZAI\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 974740.062}, {idx_by_py = 0, idx_by_hz = 308, hanzi_str = {20320, 0, 0, 0, 0, 0, 
      0, 0, 0}, hanzi_scis_ids = {308, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {251, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"NI\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 973526.125}, {
    idx_by_py = 0, idx_by_hz = 1406, hanzi_str = {21644, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {1407, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {148, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "HE\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 664874.125}, {idx_by_py = 0, idx_by_hz = 5254, hanzi_str = {26377, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {5257, 0, 0, 0, 0, 0, 0, 0}, 
    spl_idx_arr = {401, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"YOU\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 613906.75}, {idx_by_py = 0, idx_by_hz = 13, hanzi_str = {19981, 0, 0, 0, 0, 0, 0, 
      0, 0}, hanzi_scis_ids = {13, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {50, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"BU\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 590643.062}, {
    idx_by_py = 0, idx_by_hz = 2961, hanzi_str = {23601, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {2963, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {171, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "JIU\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 558432.875}}

top_lmas_数组初始化完成后，如果定义了___DO_STATISTICS___宏，就会调用stat_init()函数来初始化下一步(construct_subset)需要用到的一些数据结构。stat_init的实现如下：

#ifdef ___DO_STATISTICS___
// Resets all statistics bookkeeping before the node structure is built:
// every statistics table is zero-filled and the scalar counters are set to 0.
void DictBuilder::stat_init() {
  // Each table is cleared over the same number of bytes.
  const size_t table_bytes = sizeof(size_t) * kMaxLemmaSize;

  // Buffer-length tables.
  memset(max_sonbuf_len_, 0, table_bytes);
  memset(max_homobuf_len_, 0, table_bytes);

  // Totals tables.
  memset(total_son_num_, 0, table_bytes);
  memset(total_node_hasson_, 0, table_bytes);
  memset(total_sonbuf_num_, 0, table_bytes);
  memset(total_sonbuf_allnoson_, 0, table_bytes);
  memset(total_node_in_sonbuf_allnoson_, 0, table_bytes);
  memset(total_homo_num_, 0, table_bytes);

  // Scalar counters.
  total_lma_node_num_ = 0;
  sonbufs_numgt1_ = 0;
  sonbufs_num1_ = 0;
}

很明显，这里把相关数组元素和变量都设置为0，从而完成相关数据结构的初始化操作。重点逻辑在接下来的construct_subset()方法中。

Google原生输入法LatinIME词库构建流程分析(二)

猜你喜欢