c++ 读取UTF-8编码文本

这个是苏州大学一个有关NLP的选修课的第一个作业，刚开始有点摸不着头脑，随着慢慢接触有点理解了老师的用心。

任务是：给定一段汉语文本，将文本切分为单个character，并在character之间填充空格，以确认字符识别的效果。

刚开始我是想着把结果从控制台中输出出来，但是靠平常使用的基本库是无法做到这一点的，因为在UTF8编码时，汉字一般需要三个字节，而在GBK编码中中文一般占两个字节。想要识别是汉字还是英文或是数字这个比较容易，问题就在于怎么把识别完的汉字输出出来？如果文本是UTF8编码，我可以很容易地把代表一个汉字的三个字节数据拿出来，但是我无法告诉程序这个汉字是UTF8编码的，你把这三个字节拿去给我按照这个编码找到个汉字回来！如果不调用其他的库，那这基本不太可能做到，所以我换了个思路，不把character从控制台输出了，直接把character输出到文件里，到时候一打开文件就能看到结果对不对，而且由于我每向文件输入一个character都会同时再向文件输入一个空格作为标记，证明这个文本确实是我读取并用空格分隔开的。

对于思路的解释我等会再更新，代码先放上。我总共写了两个读取的思路，有一点小小的不同，大家可以看看。

方法一代码：

// Approach 1: classify every lead byte with bit masks.

/*
 * Returns the total sequence length (1..6) announced by a UTF-8 lead byte.
 * The 5- and 6-byte forms are still recognised, as in the original code.
 * Anything that is not a multi-byte lead (ASCII, a stray continuation byte,
 * 0xFE, 0xFF) is treated as a 1-byte unit so that no input byte is lost.
 */
static int utf8_len_from_mask(int lead)
{
	if ((lead & 0xE0) == 0xC0)  /* 110xxxxx */
	{
		return 2;
	}
	if ((lead & 0xF0) == 0xE0)  /* 1110xxxx */
	{
		return 3;
	}
	if ((lead & 0xF8) == 0xF0)  /* 11110xxx */
	{
		return 4;
	}
	if ((lead & 0xFC) == 0xF8)  /* 111110xx */
	{
		return 5;
	}
	if ((lead & 0xFE) == 0xFC)  /* 1111110x */
	{
		return 6;
	}
	return 1;
}

/*
 * Splits the UTF-8 text in `fileName` into characters and writes them to
 * "ToWrite.txt", each character followed by a single space.
 *
 * Returns true on success; false if either file cannot be opened or an
 * I/O error is detected while reading or writing.
 */
bool UTF8Reader_Approch_1(std::string fileName)
{
	FILE *fp = fopen(fileName.c_str(), "r");
	if (fp == NULL)
	{
		// Fail before touching the output so an existing result is not truncated.
		return false;
	}
	FILE *output = fopen("ToWrite.txt", "w");
	if (output == NULL)
	{
		fclose(fp);
		return false;
	}

	int lead;  // must be int: EOF is not representable in a char
	while ((lead = fgetc(fp)) != EOF)
	{
		unsigned char seq[6];
		int expected = utf8_len_from_mask(lead);
		int have = 0;
		seq[have++] = (unsigned char)lead;

		while (have < expected)
		{
			int next = fgetc(fp);
			if (next == EOF)
			{
				// Truncated sequence: emit only the bytes that really exist.
				break;
			}
			if ((next & 0xC0) != 0x80)
			{
				// Not a continuation byte (10xxxxxx): it starts the next
				// character, so push it back instead of swallowing it.
				ungetc(next, fp);
				break;
			}
			seq[have++] = (unsigned char)next;
		}

		// fwrite with an explicit length also copes with embedded NUL bytes.
		fwrite(seq, 1, (size_t)have, output);
		fputc(' ', output);
	}

	bool ok = !ferror(fp) && !ferror(output);
	fclose(fp);
	if (fclose(output) != 0)  // fclose flushes; a failure means lost output
	{
		ok = false;
	}
	return ok;
}

方法二代码：

// Approach 2: classify every lead byte by its numeric range.

/*
 * Returns the total sequence length (1..6) announced by a UTF-8 lead byte,
 * decided by comparing the byte value (0..255) against the lead-byte ranges.
 * Values outside every multi-byte range (ASCII, stray continuation bytes
 * 128..191, and 254/255) count as a 1-byte unit so they are passed through
 * instead of being dropped.
 */
static int utf8_len_from_range(int lead)
{
	if (lead >= 192 && lead <= 223)  // 2 bytes
	{
		return 2;
	}
	if (lead >= 224 && lead <= 239)  // 3 bytes
	{
		return 3;
	}
	if (lead >= 240 && lead <= 247)  // 4 bytes
	{
		return 4;
	}
	if (lead >= 248 && lead <= 251)  // 5 bytes
	{
		return 5;
	}
	if (lead >= 252 && lead <= 253)  // 6 bytes
	{
		return 6;
	}
	return 1;
}

/*
 * Splits the UTF-8 text in `fileName` into characters and writes them to
 * "ToWrite.txt", each character followed by a single space.
 *
 * Returns true on success; false if either file cannot be opened or an
 * I/O error is detected while reading or writing.
 */
bool UTF8Reader_Approch_2(std::string fileName)
{
	FILE *fp = fopen(fileName.c_str(), "r");
	if (fp == NULL)
	{
		return false;
	}
	FILE *output = fopen("ToWrite.txt", "w");
	if (output == NULL)
	{
		fclose(fp);  // do not leak the input handle
		return false;
	}

	// fgetc returns an int in 0..255 or EOF. Keeping it in an int is what
	// makes the EOF test reliable: stored in a char, a 0xFF data byte is
	// indistinguishable from EOF (or EOF is never seen if char is unsigned).
	int lead;
	while ((lead = fgetc(fp)) != EOF)
	{
		unsigned char seq[6];
		int expected = utf8_len_from_range(lead);
		int have = 0;
		seq[have++] = (unsigned char)lead;

		while (have < expected)
		{
			int next = fgetc(fp);
			if (next == EOF)
			{
				// Truncated sequence: emit only the bytes that really exist.
				break;
			}
			if (next < 128 || next > 191)
			{
				// Not a continuation byte: it belongs to the next
				// character, so push it back instead of swallowing it.
				ungetc(next, fp);
				break;
			}
			seq[have++] = (unsigned char)next;
		}

		// fwrite with an explicit length also copes with embedded NUL bytes.
		fwrite(seq, 1, (size_t)have, output);
		fputc(' ', output);
	}

	bool ok = !ferror(fp) && !ferror(output);
	fclose(fp);
	if (fclose(output) != 0)  // fclose flushes; a failure means lost output
	{
		ok = false;
	}
	return ok;
}

c++ 读取UTF-8编码文本

猜你喜欢