版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/nlpzryyclxz/article/details/48047009
由于C语言中没有封装好的Hash(Python 字典)或红黑树(C++ STL map)结构,
因此在这里借用Hash的思想,实现了一个类似Hash的三级字典存储结构。
实现代码如下
/***********************************************************
* File Name : dict.c
* Copyright :
* Module Name : 词典相关操作
*
* CPU : i3-2328M CPU @ 2.20GHz
* OS : Microsoft Windows XP
*
* Create Date : 2013/07/27
* Author/Corporation : 于飞
*
* Abstract Description :
-----------------Revision History---------------------------
No Version Date Revised By Item Description
************************************************************/
#include "dict.h"

#include <string.h>
/***********************************************************
* Function Name : Init_Dict_List
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Initialise the dictionary storage structure.
*               Each of the 65536 first-level slots of `address`
*               receives a freshly allocated, empty head node for
*               its second-level list and has its counter reset.
* Param : none
* Return Code : 0 on success, -1 if an allocation failed.
*               On failure the head nodes allocated by this call
*               are released again and the Next pointers of those
*               slots are reset to NULL; slots at and after the
*               failing index are left untouched.
************************************************************/
extern int Init_Dict_List(void)
{
    int i;

    for (i = 0; i < 65536; i++)
    {
        FollowChar *head = malloc(sizeof *head);

        if (head == NULL)
        {
            printf("没有内存啦");
            /* Roll back so the table is not left half-initialised. */
            while (i-- > 0)
            {
                free(address[i].Next);
                address[i].Next = NULL;
            }
            return -1;
        }

        /* The head is a sentinel: it carries no key and no third-level list. */
        head->Next = NULL;
        head->priority = 0;
        head->value = 0;
        head->Down = NULL;

        address[i].Next = head;
        address[i].priority = 0;
    }
    return 0;
}
/***********************************************************
* Function Name : Create_Dict
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Build the dictionary. Initialises the table, reads
*               words with Read_a_Word until it returns 0 and
*               registers each one in the first, second or third
*               level depending on the returned count, then
*               converts the collected counts with
*               Word_Cnt_Statistic / Wordcnt_To_Wordpro.
* Param : fp : corpus stream, passed through to Read_a_Word
* Return Code : 0 on success, 1 if the table could not be
*               initialised (nothing is read in that case)
************************************************************/
extern unsigned int Create_Dict(FILE *fp)
{
    /* NOTE(review): Read_a_Word receives no buffer sizes; it is assumed
     * never to write more than 100 / 5 bytes - confirm against its
     * definition. */
    char wbuffer[100] = "\0";
    char tbuffer[5] = "\0";
    unsigned int cnt = 0;
    FollowChar *stp1;
    FollowWords *stp2;
    double wordcount = 0;

    /* Every lookup below dereferences the heads created here. */
    if (Init_Dict_List() != 0)
    {
        return 1;
    }

    /* NOTE(review): cnt looks like the byte length of the word plus one
     * for its terminator (3 = one double-byte character, 5 = two); this
     * is inferred from the branches below - confirm against Read_a_Word. */
    while ((cnt = Read_a_Word(fp, wbuffer, tbuffer)) != 0)
    {
        if (cnt == 3)
        {
            /* Single character: counted directly in the first level. */
            QuAndIn_First_List(&wbuffer[0], 1);
        }
        else if (cnt == 5)
        {
            /* Two characters: counted in the second level. */
            stp1 = QuAndIn_First_List(&wbuffer[0], 0);
            if (stp1 != NULL)
            {
                QuAndIn_Second_List(stp1, &wbuffer[2], 1);
            }
        }
        else if (cnt > 5)
        {
            /* Longer word: the bytes after the first two characters are
             * stored in the third level. */
            stp1 = QuAndIn_First_List(&wbuffer[0], 0);
            stp2 = (stp1 != NULL)
                       ? QuAndIn_Second_List(stp1, &wbuffer[2], 0)
                       : NULL;
            if (stp2 != NULL)
            {
                QuAndIn_Third_List(stp2, &wbuffer[4], cnt - 4);
            }
        }
        /* cnt of 1, 2 or 4 is skipped: the buffer does not hold enough
         * bytes for the second/third-level keys, and cnt - 4 would wrap
         * around (or be 0) as an unsigned value. */
    }
    printf("创建双字哈希词典中...");
    printf("词典创建完毕!\n");

    wordcount = Word_Cnt_Statistic();
    Wordcnt_To_Wordpro(wordcount);
    return 0;
}
/***********************************************************
* Function Name : QuAndIn_First_List
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Look up the first-level slot of a word and,
*               optionally, count one occurrence of it.
* Param : arr  : bytes of the word; arr[0] (high byte) and arr[1]
*                (low byte) form the 16-bit slot index
*         flag : non-zero (single-character word) increments the
*                slot's priority; zero only performs the lookup
* Return Code : the Next pointer of the slot, i.e. the head of the
*               second-level list attached to it
************************************************************/
extern FollowChar *QuAndIn_First_List(char arr[],int flag)
{
    const unsigned int high = (unsigned char)arr[0];
    const unsigned int low = (unsigned char)arr[1];
    /* Two bytes combined big-endian: always within 0..65535. */
    const unsigned int slot = (high << 8) | low;

    if (flag != 0)
    {
        address[slot].priority++;
    }
    return address[slot].Next;
}
/***********************************************************
* Function Name : QuAndIn_Second_List
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Look up a key in a second-level list and create
*               its node when it is missing. The list is kept in
*               descending order of `value`.
* Param : p    : head (sentinel) node of the second-level list
*         arr  : bytes of the key; arr[0] (high byte) and arr[1]
*                (low byte) form the 16-bit value
*         flag : non-zero (two-character word) counts one
*                occurrence on the node; zero only looks up/creates
* Return Code : the third-level list head (Down) of the matching
*               or newly created node; NULL if p or arr is NULL
*               or if memory could not be allocated (the list is
*               left unchanged in that case)
************************************************************/
extern FollowWords *QuAndIn_Second_List(FollowChar *p,char arr[],int flag)
{
    unsigned short int key;
    FollowChar *prev;
    FollowChar *cur;
    FollowChar *node;
    FollowWords *third_head;

    if (p == NULL || arr == NULL)
    {
        return NULL;
    }

    key = (unsigned short int)(((unsigned char)arr[0] << 8)
                               | (unsigned char)arr[1]);

    /* Walk past every node whose value is larger than the key; `prev`
     * ends up as the node after which a new entry has to be inserted. */
    prev = p;
    for (cur = p->Next; cur != NULL; cur = cur->Next)
    {
        if (key > cur->value)
        {
            break;
        }
        if (key == cur->value)
        {
            if (flag)
            {
                cur->priority++;
            }
            return cur->Down;
        }
        prev = cur;
    }

    /* Allocate both parts before touching the list so that a failure
     * can never leave a node with an invalid Down pointer linked in. */
    node = malloc(sizeof *node);
    third_head = malloc(sizeof *third_head);
    if (node == NULL || third_head == NULL)
    {
        printf("没有内存啦!");
        free(node);
        free(third_head);
        return NULL;
    }

    /* Empty sentinel for the third-level list of this key. */
    third_head->Next = NULL;
    third_head->priority = 0;
    third_head->wordptr = NULL;

    node->value = key;
    node->priority = flag ? 1 : 0;
    node->Down = third_head;
    node->Next = prev->Next;
    prev->Next = node;

    return third_head;
}
/***********************************************************
* Function Name : QuAndIn_Third_List
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Look up a word tail in a third-level list; count
*               one more occurrence when it is present, otherwise
*               append a new node with a count of 1.
* Param : p   : head (sentinel) node of the third-level list
*         arr : NUL-terminated word tail; compared against stored
*               entries with strcmp
*         num : number of bytes of arr to store for a new entry
* Return Code : the matching or newly appended node; NULL if p or
*               arr is NULL or if memory could not be allocated
*               (the list is left unchanged in that case)
************************************************************/
extern FollowWords *QuAndIn_Third_List(FollowWords *p,char arr[],unsigned int num)
{
    FollowWords *node;
    char *copy;

    if (p == NULL || arr == NULL)
    {
        return NULL;
    }

    /* Linear search; when nothing matches, p ends up on the last node. */
    while (p->Next != NULL)
    {
        p = p->Next;
        if (strcmp(p->wordptr, arr) == 0)
        {
            p->priority++;
            return p;
        }
    }

    node = malloc(sizeof *node);
    if (node == NULL)
    {
        printf("没有内存啦!");
        return NULL;
    }

    /* One extra byte so the stored word is always NUL-terminated, whether
     * or not num already covers the terminator of arr; the strcmp above
     * relies on that for every stored entry. */
    copy = malloc((size_t)num + 1);
    if (copy == NULL)
    {
        printf("内存不够啦!");
        free(node);
        return NULL;
    }
    memcpy(copy, arr, num);
    copy[num] = '\0';

    node->priority = 1;
    node->wordptr = copy;
    node->Next = p->Next;
    p->Next = node;
    return node;
}
/***********************************************************
* Function Name : Word_Cnt_Statistic
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Add up the priority counters of all three levels
*               of the dictionary.
* Param : none
* Return Code : the sum of all counters, as a double
************************************************************/
extern double Word_Cnt_Statistic(void)
{
    double total = 0;
    unsigned int slot;

    for (slot = 0; slot < 65536; slot++)
    {
        FollowChar *second;

        total += address[slot].priority;

        /* The first node of each list is a header and is not counted. */
        for (second = address[slot].Next->Next; second != NULL; second = second->Next)
        {
            FollowWords *third;

            total += second->priority;
            for (third = second->Down->Next; third != NULL; third = third->Next)
            {
                total += third->priority;
            }
        }
    }
    return total;
}
/* Convert an occurrence count into -log(count / total). A ratio of zero is
 * replaced by DBL_MIN so that log() is never handed 0. */
static double count_to_cost(double count, double total)
{
    double ratio = count / total;

    if (ratio == 0)
    {
        ratio = DBL_MIN;
    }
    return -log(ratio);
}

/***********************************************************
* Function Name : Wordcnt_To_Wordpro
* Create Date : 2013/07/27
* Author/Corporation : 于飞
* Description : Replace the occurrence counter of every entry in
*               all three levels by -log(count / cnt).
*               - Counts of 0 are clamped to DBL_MIN on every
*                 level (second-level nodes created only as a
*                 prefix of longer words have a count of 0).
*               - First-level slots that were never used (count 0
*                 and no second-level entries) are left at 0.
*               - First-level slots with a non-zero count are
*                 converted even when they have no second-level
*                 entries.
* Param : cnt : total number of word occurrences; when it is not
*               greater than 0 nothing is converted
* Return Code : none
************************************************************/
extern void Wordcnt_To_Wordpro(double cnt)
{
    unsigned int i;

    /* Also rejects NaN; avoids dividing by zero for an empty corpus. */
    if (!(cnt > 0))
    {
        return;
    }

    for (i = 0; i < 65536; i++)
    {
        FollowChar *head = address[i].Next;
        FollowChar *p1;

        if (head == NULL)
        {
            continue;
        }
        if (head->Next == NULL && address[i].priority == 0)
        {
            continue;   /* slot never used by the corpus */
        }

        address[i].priority = count_to_cost(address[i].priority, cnt);

        for (p1 = head->Next; p1 != NULL; p1 = p1->Next)
        {
            FollowWords *p2;

            p1->priority = count_to_cost(p1->priority, cnt);
            if (p1->Down == NULL)
            {
                continue;
            }
            /* Skip the third-level header; it holds no word. */
            for (p2 = p1->Down->Next; p2 != NULL; p2 = p2->Next)
            {
                p2->priority = count_to_cost(p2->priority, cnt);
            }
        }
    }
}
/***********************************************************
* Function Name : Destroy_Dict
* Description : Release every node of the dictionary: for each of
*               the 65536 first-level slots, all second-level
*               nodes (including the list head), their third-level
*               lists (including each list head) and the stored
*               word strings.
*               Each slot's Next pointer is reset to NULL
*               afterwards, so calling the function again is
*               harmless. The slots' priority values are not
*               modified.
* Param : none
* Return Code : none
************************************************************/
extern void Destroy_Dict(void)
{
    unsigned int i;

    for (i = 0; i < 65536; i++)
    {
        /* Start at the list head itself: its Down is NULL, so it is
         * released by the same loop as the real second-level nodes. */
        FollowChar *second = address[i].Next;

        while (second != NULL)
        {
            FollowChar *next_second = second->Next;
            FollowWords *third = second->Down;

            /* Frees the third-level header too; its wordptr is NULL,
             * which free() accepts. */
            while (third != NULL)
            {
                FollowWords *next_third = third->Next;

                free(third->wordptr);
                free(third);
                third = next_third;
            }

            free(second);
            second = next_second;
        }

        /* Do not leave a dangling pointer behind in the table. */
        address[i].Next = NULL;
    }
}