思路:
UTF-8 下的中文字符占 3 个字节,格式为 1110xxxx 10xxxxxx 10xxxxxx,所以遇到中文时按 3 个字节读入,以字节值是否大于等于 0x80 作为判断条件。英文字符都按 1 个字节读入。
函数:
参数:
str:输入的 UTF-8 字符串
str_len:(输出)转化后的长度,即字符个数
chnum:(输出)中文字符个数
ennum:(输出)英文字符个数
/**
 * Decode a UTF-8 encoded string into an array of Unicode code points.
 *
 * Single-byte (ASCII) characters are counted in *ennum. Multi-byte
 * characters (2-, 3- and 4-byte sequences; Chinese characters are the
 * 3-byte case) are counted in *chnum. Malformed input -- a stray
 * continuation byte, an invalid lead byte, or a sequence that is cut short --
 * is skipped one byte at a time and is not counted. Overlong encodings and
 * surrogate values are not rejected.
 *
 * @param str      UTF-8 encoded input.
 * @param str_len  Out: number of code points written (*chnum + *ennum).
 * @param chnum    Out: number of multi-byte characters decoded.
 * @param ennum    Out: number of single-byte (ASCII) characters decoded.
 * @return A malloc'ed array holding *str_len code points; the caller owns it
 *         and must release it with free(). Returns NULL (with all counts set
 *         to 0) if memory cannot be allocated.
 */
static int* StringToUnicode(string str,int* str_len,int * chnum,int * ennum)
{
    const size_t byte_count = str.size();
    int en_num = 0;
    int ch_num = 0;
    size_t count = 0;

    *str_len = 0;
    *chnum = 0;
    *ennum = 0;

    // Guard the size computation below against wrap-around.
    if (byte_count > (size_t)-1 / sizeof(int))
        return NULL;

    // Every code point consumes at least one input byte, so byte_count is an
    // upper bound on the number of code points; no fixed-size scratch buffer
    // (and no index wrap-around) is needed. Allocate at least one element so
    // an empty input still yields a valid, freeable pointer.
    int* res = (int*)malloc(sizeof(int) * (byte_count > 0 ? byte_count : 1));
    if (res == NULL)
        return NULL;

    size_t i = 0;
    while (i < byte_count)
    {
        // Read the byte as unsigned: plain char may be signed, in which case
        // bytes >= 0x80 would otherwise become negative.
        const unsigned char lead = (unsigned char)str[i];

        if (lead < 0x80)
        {
            res[count++] = lead;
            en_num++;
            i++;
            continue;
        }

        // Determine the sequence length and payload bits from the lead byte.
        size_t extra;
        int cp;
        if ((lead & 0xE0) == 0xC0)        // 110xxxxx
        {
            extra = 1;
            cp = lead & 0x1F;
        }
        else if ((lead & 0xF0) == 0xE0)   // 1110xxxx
        {
            extra = 2;
            cp = lead & 0x0F;
        }
        else if ((lead & 0xF8) == 0xF0)   // 11110xxx
        {
            extra = 3;
            cp = lead & 0x07;
        }
        else
        {
            // Stray continuation byte (10xxxxxx) or invalid lead byte.
            i++;
            continue;
        }

        // Fold in the continuation bytes, stopping at the first byte that is
        // missing or is not of the form 10xxxxxx.
        size_t k = 1;
        while (k <= extra && i + k < byte_count &&
               ((unsigned char)str[i + k] & 0xC0) == 0x80)
        {
            cp = (cp << 6) | ((unsigned char)str[i + k] & 0x3F);
            k++;
        }

        if (k <= extra)
        {
            // Incomplete sequence: drop only the lead byte and resynchronise
            // on the following byte.
            i++;
            continue;
        }

        res[count++] = cp;
        ch_num++;
        i += extra + 1;
    }

    *str_len = ch_num + en_num;
    *chnum = ch_num;
    *ennum = en_num;

    // Shrink to the exact size; if shrinking fails the original buffer is
    // still valid, so keep it.
    int* shrunk = (int*)realloc(res, sizeof(int) * (count > 0 ? count : 1));
    if (shrunk != NULL)
        res = shrunk;

    return res;
}