// 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/AlanConstantineLau/article/details/69484364
// 概念和原理可以参考《中文分词基础原则及正向最大匹配法、逆向最大匹配法、双向最大匹配法的分析》。
using System;
using System.Linq;
using System.Text;
using System.IO;
using System.Data.OleDb;
namespace Segamentation1
{
/// <summary>
/// Console demo of reverse (right-to-left) maximum matching word segmentation.
/// Each line of a text file is split into words by looking candidates up in the
/// <c>words</c> table of an Access (Jet) database, and the segmented line is
/// printed with every word prefixed by '/'.
/// </summary>
class Program
{
    /// <summary>Longest candidate word, in characters, tried at each position.</summary>
    private const int MaxWordLength = 8;

    // Default corpus location, used when no command-line argument is given.
    // NOTE(review): this is a verbatim string, so the doubled backslashes are
    // literal characters. It is kept byte-for-byte from the original; presumably
    // the OS path normalization tolerates repeated separators - confirm.
    private const string DefaultCorpusPath = @"D:\\Work\\Project\\2012.11.15北京:习总书记会见出席党的十八大代表、特邀代表和列席人员时的重要讲话.txt";

    // Default connection string for the word-list database.
    private const string DefaultConnectionString = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=D:\\Work\\Project\\words.mdb";

    /// <summary>
    /// Prints the corpus, a separator, then the segmentation of every line.
    /// </summary>
    /// <param name="args">
    /// Optional overrides: <c>args[0]</c> is the corpus text file path and
    /// <c>args[1]</c> is the path of the .mdb word-list database. When omitted,
    /// the built-in default locations are used.
    /// </param>
    static void Main(string[] args)
    {
        string filePath = (args != null && args.Length > 0) ? args[0] : DefaultCorpusPath;
        string connectionString = (args != null && args.Length > 1)
            ? BuildConnectionString(args[1])
            : DefaultConnectionString;

        // Reading must never create the input file, so a missing corpus is
        // reported instead of being silently replaced by an empty file.
        if (!File.Exists(filePath))
        {
            Console.Error.WriteLine("Corpus file not found: " + filePath);
            Environment.ExitCode = 1;
            return;
        }

        // Read the corpus once; the file handle is released as soon as this returns.
        // NOTE(review): Encoding.Default is kept from the original; what it maps to
        // depends on the runtime and platform - confirm it matches the corpus file.
        string[] lines = File.ReadAllLines(filePath, Encoding.Default);

        // Echo the raw corpus.
        foreach (string line in lines)
        {
            Console.WriteLine(line);
        }

        Console.WriteLine();
        Console.WriteLine("===================================");

        // Only touch the database when there is something to segment.
        if (lines.Length > 0)
        {
            // One connection and one parameterized command serve every lookup.
            // OleDb parameters are positional and marked with '?', so text that
            // contains quotes can no longer break (or alter) the SQL statement.
            using (OleDbConnection connection = new OleDbConnection(connectionString))
            using (OleDbCommand lookup = new OleDbCommand("SELECT word FROM words WHERE word = ?", connection))
            {
                OleDbParameter wordParameter = lookup.Parameters.Add("@word", OleDbType.VarWChar, MaxWordLength);
                connection.Open();

                Func<string, bool> isWord = candidate =>
                {
                    wordParameter.Value = candidate;
                    // ExecuteScalar returns null when the query yields no rows.
                    return lookup.ExecuteScalar() != null;
                };

                foreach (string line in lines)
                {
                    Console.WriteLine();
                    Console.WriteLine(Segment(line, MaxWordLength, isWord));
                }
            }
        }

        Console.WriteLine("The end!");
    }

    /// <summary>
    /// Builds a Jet OLE DB connection string for the given database file.
    /// </summary>
    /// <param name="databasePath">Path of the .mdb file holding the word list.</param>
    /// <returns>The connection string for that database.</returns>
    private static string BuildConnectionString(string databasePath)
    {
        OleDbConnectionStringBuilder builder = new OleDbConnectionStringBuilder();
        builder.Provider = "Microsoft.Jet.OLEDB.4.0";
        builder.DataSource = databasePath;
        return builder.ConnectionString;
    }

    /// <summary>
    /// Segments one line by reverse maximum matching: starting at the end of the
    /// text, the longest window (up to <paramref name="maxWordLength"/> characters)
    /// is tried first and shortened from the left until it is a dictionary word
    /// or a single character.
    /// </summary>
    /// <param name="text">The line to segment; may be empty.</param>
    /// <param name="maxWordLength">Maximum candidate length in characters.</param>
    /// <param name="isWord">Returns true when the candidate is in the dictionary.</param>
    /// <returns>
    /// The words in reading order, each preceded by '/'; an empty string for
    /// empty input.
    /// </returns>
    private static string Segment(string text, int maxWordLength, Func<string, bool> isWord)
    {
        StringBuilder output = new StringBuilder();

        // 'end' is the exclusive end of the part of the text not yet segmented.
        int end = text.Length;
        while (end > 0)
        {
            int start = Math.Max(0, end - maxWordLength);

            // Drop leading characters until the candidate is a known word.
            // A single character is always emitted, so it needs no lookup.
            while (end - start > 1 && !isWord(text.Substring(start, end - start)))
            {
                start++;
            }

            // Words are found right-to-left, so prepend to restore reading order.
            output.Insert(0, text.Substring(start, end - start));
            output.Insert(0, '/');
            end = start;
        }

        return output.ToString();
    }
}
}