用 C# 实现基于词表的中文逆向最大匹配分词算法

概念和原理可以参考《中文分词基础原则及正向最大匹配法、逆向最大匹配法、双向最大匹配法的分析》一文。
using System;
using System.Linq;
using System.Text;
using System.IO;
using System.Data.OleDb;


namespace Segamentation1
{
    /// <summary>
    /// Console program that segments a Chinese text file, line by line, using
    /// lexicon-based backward (reverse) maximum matching: starting at the end
    /// of a line it takes the longest candidate (up to
    /// <see cref="MaxWordLength"/> characters) that exists in the lexicon
    /// table, emits it, and continues with the text to the left of it.
    /// </summary>
    class Program
    {
        /// <summary>Path of the raw corpus file that is echoed and then segmented.</summary>
        // NOTE(review): this is a verbatim literal, so the doubled backslashes
        // are part of the path. The value is kept unchanged here; confirm the
        // target platform accepts repeated separators before relying on it.
        private const string CorpusPath = @"D:\\Work\\Project\\2012.11.15北京：习总书记会见出席党的十八大代表、特邀代表和列席人员时的重要讲话.txt";

        /// <summary>OLE DB connection string of the Access database that holds the lexicon.</summary>
        private const string LexiconConnectionString = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=D:\\Work\\Project\\words.mdb";

        /// <summary>
        /// Lexicon lookup. The candidate is bound through a positional
        /// parameter instead of being concatenated into the SQL text, so
        /// candidates containing quote characters cannot break the statement.
        /// </summary>
        private const string LexiconQuery = "SELECT word FROM words WHERE word = ?";

        /// <summary>Longest candidate word, in characters, that is looked up in the lexicon.</summary>
        private const int MaxWordLength = 8;

        /// <summary>
        /// Echoes the corpus file to the console, prints a separator, then
        /// prints the segmentation of every line (each token prefixed with
        /// '/'), and finally prints "The end!".
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Reading the corpus must never create it: report a missing file
            // instead of silently producing an empty one.
            if (!File.Exists(CorpusPath))
            {
                Console.Error.WriteLine("Corpus file not found: " + CorpusPath);
                return;
            }

            // Read the raw corpus once; the same lines are echoed and segmented.
            string[] lines = File.ReadAllLines(CorpusPath, Encoding.Default);

            foreach (string rawLine in lines)
            {
                Console.WriteLine(rawLine);
            }

            Console.WriteLine();
            Console.WriteLine("===================================");

            // Only touch the database when there is something to segment.
            if (lines.Length > 0)
            {
                // One connection and one prepared lookup command serve every
                // line; both are disposed even if a lookup throws.
                using (OleDbConnection connection = new OleDbConnection(LexiconConnectionString))
                using (OleDbCommand lookup = new OleDbCommand(LexiconQuery, connection))
                {
                    lookup.Parameters.Add("@word", OleDbType.VarWChar);
                    connection.Open();

                    // Process the corpus line by line.
                    foreach (string textLine in lines)
                    {
                        Console.WriteLine();
                        // Print the segmentation result.
                        Console.WriteLine(Segment(textLine, lookup));
                    }
                }
            }

            Console.WriteLine("The end!");
        }

        /// <summary>
        /// Segments one line with backward maximum matching.
        /// </summary>
        /// <param name="text">The line to segment; may be empty.</param>
        /// <param name="lookup">
        /// Open lexicon command whose first parameter receives the candidate word.
        /// </param>
        /// <returns>
        /// The tokens in their original order, each prefixed with '/'
        /// (an empty string for an empty line).
        /// </returns>
        private static string Segment(string text, OleDbCommand lookup)
        {
            StringBuilder segmented = new StringBuilder(text.Length * 2);

            // 'end' is the exclusive end of the part that is still unsegmented;
            // tokens are cut off its right-hand side until nothing is left.
            int end = text.Length;
            while (end > 0)
            {
                // Start with the longest allowed candidate ending at 'end'.
                int start = Math.Max(0, end - MaxWordLength);

                // Drop characters from the left until the candidate is a
                // lexicon word. A single character is always accepted as a
                // token, so it needs no lookup.
                while (end - start > 1 && !IsInLexicon(lookup, text.Substring(start, end - start)))
                {
                    start++;
                }

                // Scanning runs right to left, so each token is prepended.
                segmented.Insert(0, text.Substring(start, end - start));
                segmented.Insert(0, '/');
                end = start;
            }

            return segmented.ToString();
        }

        /// <summary>
        /// Checks whether <paramref name="candidate"/> is present in the lexicon table.
        /// </summary>
        /// <param name="lookup">Open lexicon command with one positional parameter.</param>
        /// <param name="candidate">The candidate word to look up.</param>
        /// <returns><c>true</c> when the query returns a row; otherwise <c>false</c>.</returns>
        private static bool IsInLexicon(OleDbCommand lookup, string candidate)
        {
            lookup.Parameters[0].Value = candidate;
            return lookup.ExecuteScalar() != null;
        }
    }
}
C#写中文基于词表的最大逆向匹配分词算法

猜你喜欢