原文地址为:
ICTCLAS.NET——给C/C++程序写.NET wrapper
转载请注明本文地址: ICTCLAS.NET——给C/C++程序写.NET wrapper
很多时候想通过.NET调用一些C/C++写的库,但是一直都不知道怎么弄。去网上找了一些资料,大多数是教如何通过托管C++和非托管C++的混合编程来完成C/C++类库的.NET Wrapper。
有的时候用C#来实现一个功能的时候,可能要调用windows api,往往都是到网上现查代码,然后粘过来使用,没有细研究到底是怎么做到的。
最近一个朋友用到分词,所以就研究了一下中科院提供的中文分词软件,详情请访问http://ictclas.org/。用了一下还挺好用的,速度没有测试,感觉用于学术研究肯定是没有问题的,如果要用到项目中,我觉得还是公司自己实现会比较好。
可惜该组件没有提供.NET版本,只有C++版本和Java版本(Java版也是通过调用本地C语言的版本实现的)。给的开发包中有一个编译好的dll库。
想想之前调用windows api的时候,不正是从一些系统的dll中导入函数,然后再通过C#代码进行调用的吗?想到这里,我就觉得我可以通过导入该分词dll中的函数用C#来完成该程序的wrapper。
说干就干,我试着导入了几个简单的函数,发现可以调用,感觉非常好,然后花了一段时间给这个库写了.NET Wrapper,方便自己以后用C#调用该接口来分词。
核心代码如下,用单例模式实现,感觉设计得不是很好,不知道各位有没有什么建议?
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
namespace ICTCLAS.NET
{
//////////////////////////////////////////////////////////////////////// //
// character coding types
//////////////////////////////////////////////////////////////////////// //
/// <summary>
/// Character encodings that can be passed to the native segmenter.
/// The numeric values are the implicit 0..4 sequence of the original declaration,
/// written out explicitly here.
/// </summary>
public enum ECodeType
{
    /// <summary>Encoding not known.</summary>
    CODE_TYPE_UNKNOWN = 0,

    /// <summary>ASCII.</summary>
    CODE_TYPE_ASCII = 1,

    /// <summary>GB family: GB2312, GBK, GB18030.</summary>
    CODE_TYPE_GB = 2,

    /// <summary>UTF-8.</summary>
    CODE_TYPE_UTF8 = 3,

    /// <summary>BIG5.</summary>
    CODE_TYPE_BIG5 = 4
}
/// <summary>
/// Part-of-speech tag set selectors passed to <c>ICTCLAS_SetPOSmap</c>,
/// plus two size constants declared alongside them.
/// </summary>
public enum EPosTag
{
/// <summary>
/// ICT (Institute of Computing Technology) second-level tag set
/// </summary>
ICT_POS_MAP_SECOND = 0,
/// <summary>
/// ICT (Institute of Computing Technology) first-level tag set
/// </summary>
ICT_POS_MAP_FIRST = 1,
/// <summary>
/// PKU (Peking University) second-level tag set
/// </summary>
PKU_POS_MAP_SECOND = 2,
/// <summary>
/// PKU (Peking University) first-level tag set
/// </summary>
PKU_POS_MAP_FIRST = 3,
/// <summary>
/// Number of tag sets
/// </summary>
POS_MAP_NUMBER = 4,
/// <summary>
/// Maximum size, in bytes, of a part-of-speech tag
/// </summary>
POS_SIZE = 8
}
/// <summary>
/// Native segmentation result record filled in by <c>ICTCLAS_ParagraphProcessAW</c>.
/// Eight consecutive 32-bit integers (offsets 0, 4, ..., 28; 32 bytes in total),
/// expressed as a sequential layout instead of explicit per-field offsets.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 4)]
struct result_t
{
    /// <summary>Start position of the word; used by the wrapper as a byte offset into the encoded input.</summary>
    public int start;

    /// <summary>Length of the word; used by the wrapper as a byte count.</summary>
    public int length;

    // NOTE(review): sPos and sPosLow together span 8 bytes, which matches EPosTag.POS_SIZE;
    // presumably they overlay the native POS tag text - confirm against the native header.
    public int sPos;
    public int sPosLow;

    /// <summary>Part-of-speech id.</summary>
    public int POS_id;

    /// <summary>Word id.</summary>
    public int word_ID;

    /// <summary>Word type.</summary>
    public int word_type;

    /// <summary>Weight of the word.</summary>
    public int weight;
}
/// <summary>
/// One segmented word as returned by <c>WordSegger.SegStr</c>.
/// </summary>
public struct Word
{
/// <summary> Text of the word. </summary>
public string str;
/// <summary> Part-of-speech id, copied from the native result's POS_id. </summary>
public int pos_id;
/// <summary> Word id, copied from the native result's word_ID. </summary>
public int word_id;
/// <summary> Weight, copied from the native result's weight. </summary>
public int weight;
/// <summary> Word type, copied from the native result's word_type. </summary>
public int word_type;
}
/// <summary>
/// Managed wrapper around the native ICTCLAS50 word-segmentation library,
/// exposed as a singleton obtained through <see cref="GetInstance"/>.
/// </summary>
public class WordSegger
{
    /// <summary>
    /// Holds the singleton instance. The explicit (empty) static constructor keeps the
    /// compiler from marking this type beforefieldinit, so the instance is created on
    /// first access to <see cref="instance"/> rather than at an unspecified earlier time.
    /// </summary>
    class Nested
    {
        static Nested()
        {
        }

        internal static readonly WordSegger instance = new WordSegger();
    }

    // Serializes ICTCLAS_Init / ICTCLAS_Exit calls.
    private static readonly object lockobj = new object();

    // volatile because it is also read outside the lock in GetInstance and Release.
    private static volatile bool inited = false;

    /// <summary>
    /// Returns the segmenter, initializing the native library on the first call.
    /// The first call must supply <paramref name="path"/>; later calls may omit it.
    /// </summary>
    /// <param name="path">Directory containing the configuration file and the data files.</param>
    /// <returns>The singleton instance, or <c>null</c> when native initialization fails.</returns>
    public static WordSegger GetInstance(string path = "")
    {
        if (inited)
        {
            return Nested.instance;
        }

        lock (lockobj)
        {
            if (!inited)
            {
                inited = ICTCLAS_Init(path);
                if (!inited)
                {
                    return null;
                }
            }

            return Nested.instance;
        }
    }

    /// <summary>
    /// Releases the native resources held by the segmenter by calling <c>ICTCLAS_Exit</c>.
    /// Does nothing when the library is not initialized. A later
    /// <see cref="GetInstance"/> call initializes the library again.
    /// </summary>
    public static void Release()
    {
        if (!inited)
        {
            return;
        }

        lock (lockobj)
        {
            if (inited)
            {
                ICTCLAS_Exit();
                inited = false;
            }
        }
    }

    /// <summary>
    /// Private constructor: instances are only created through <see cref="GetInstance"/>.
    /// </summary>
    private WordSegger()
    {
    }

    /// <summary>
    /// Finalizer; calls <see cref="Release"/> so the native library is shut down
    /// if the caller never released it explicitly.
    /// </summary>
    ~WordSegger()
    {
        Release();
    }

    /// <summary>
    /// Segments a string into words.
    /// </summary>
    /// <param name="str">Text to segment.</param>
    /// <param name="ecode">Encoding hint passed to the native library.</param>
    /// <param name="posTagged">Whether part-of-speech tagging is performed.</param>
    /// <returns>The segmented words; an empty array for empty input or when the native call reports no words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is <c>null</c>.</exception>
    public Word[] SegStr(string str, ECodeType ecode, bool posTagged = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (str.Length == 0)
        {
            return new Word[0];
        }

        // One result slot per input character, as in the original sizing.
        result_t[] result = new result_t[str.Length];
        int cnt = ICTCLAS_ParagraphProcessAW(str, result, ecode, posTagged);
        if (cnt <= 0)
        {
            // A non-positive count would otherwise make the array allocation below throw.
            return new Word[0];
        }

        // start/length are applied as byte offsets into the text encoded with Encoding.Default.
        // NOTE(review): this assumes Encoding.Default is the same code page the CharSet.Ansi
        // marshaler used for the native call - confirm on the target runtime.
        byte[] mybyte = Encoding.Default.GetBytes(str);
        Word[] words = new Word[cnt];
        for (int i = 0; i < cnt; i++)
        {
            words[i].str = Encoding.Default.GetString(mybyte, result[i].start, result[i].length);
            words[i].pos_id = result[i].POS_id;
            words[i].word_id = result[i].word_ID;
            words[i].weight = result[i].weight;
            words[i].word_type = result[i].word_type;
        }

        return words;
    }

    /// <summary>
    /// Segments a text file and writes the result to another file.
    /// </summary>
    /// <param name="src">Path of the source file.</param>
    /// <param name="ct">Encoding of the source file.</param>
    /// <param name="des">Path of the destination file.</param>
    /// <param name="postag">Whether part-of-speech tagging is performed.</param>
    /// <returns>Whether segmentation succeeded.</returns>
    public bool SegFile(string src, ECodeType ct, string des, bool postag = false)
    {
        return ICTCLAS_FileProcess(src, des, ct, postag);
    }

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    /// <returns>Success or failure as reported by the native library.</returns>
    public bool SetPosTagMap(EPosTag nPOSmap)
    {
        return ICTCLAS_SetPOSmap(nPOSmap);
    }

    /// <summary>
    /// Imports a user dictionary from a file.
    /// File format: one word per line; word and part of speech are separated by "@@"
    /// (for example "中科院@@nr"); the part of speech may be omitted.
    /// </summary>
    /// <param name="path">Path of the user dictionary file.</param>
    /// <param name="ct">Encoding of the file.</param>
    /// <returns>Number of user words imported.</returns>
    public uint ImportUserDictFile(string path, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        return ICTCLAS_ImportUserDictFile(path, ct);
    }

    /// <summary>
    /// Imports user words from a string.
    /// </summary>
    /// <param name="userDict">
    /// User words: word and part of speech separated by "@@", entries separated by a
    /// half-width ';', part of speech optional. For example "中科院@@nr;分词 v;系统@@adj;"
    /// or "中科院;分词;系统;".
    /// </param>
    /// <param name="ct">Encoding.</param>
    /// <returns>Number of user words imported.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="userDict"/> is <c>null</c>.</exception>
    public uint ImportUserDict(string userDict, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        if (userDict == null)
        {
            throw new ArgumentNullException("userDict");
        }

        // The native side receives an ANSI byte buffer, so the length must be the encoded
        // byte count; string.Length counts UTF-16 chars and under-counts Chinese text.
        int byteLength = Encoding.Default.GetByteCount(userDict);
        return ICTCLAS_ImportUserDict(userDict, byteLength, ct);
    }

    /// <summary>
    /// Saves the user dictionary.
    /// </summary>
    /// <returns><c>true</c> when the native call returns a non-zero value.</returns>
    public bool SaveUserDict()
    {
        return ICTCLAS_SaveTheUsrDic() != 0;
    }

    // Library and entry-point names must not carry surrounding whitespace: the entry point
    // string is matched against the DLL's export names.
    const string DLLPATH = @"ICTCLAS50.dll";

    // NOTE(review): the bool returns/parameters below use default P/Invoke bool marshaling
    // (4-byte BOOL). If the native functions use 1-byte C++ bool, they would need
    // MarshalAs(UnmanagedType.I1) - confirm against the native header.

    /// <summary>
    /// Initializes the library. Must succeed before any other native function is called.
    /// </summary>
    /// <param name="sInitDirPath">Directory containing the configuration file and data files.</param>
    /// <returns>Whether initialization succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    private static extern bool ICTCLAS_Init(string sInitDirPath);

    /// <summary>
    /// Releases the library's resources; to be called once all operations are finished.
    /// </summary>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    private static extern bool ICTCLAS_Exit();

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SetPOSmap")]
    private static extern bool ICTCLAS_SetPOSmap(EPosTag nPOSmap);

    /// <summary>
    /// Imports a user dictionary file (one word per line, "@@" between word and
    /// part of speech, part of speech optional).
    /// </summary>
    /// <param name="pszFileName">Path of the user dictionary file.</param>
    /// <param name="codeType">Encoding of the dictionary.</param>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDictFile")]
    private static extern uint ICTCLAS_ImportUserDictFile(string pszFileName, ECodeType codeType = ECodeType.CODE_TYPE_UNKNOWN);

    /// <summary>
    /// Builds a user dictionary from the given words; it replaces the user dictionary
    /// currently held in memory.
    /// </summary>
    /// <param name="pszDictBuffer">
    /// User words: "@@" between word and part of speech, half-width ';' between entries,
    /// part of speech optional.
    /// </param>
    /// <param name="length">Length of the buffer.</param>
    /// <param name="codeType">Encoding.</param>
    /// <returns>Number of words imported.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    private static extern uint ICTCLAS_ImportUserDict(string pszDictBuffer, int length, ECodeType codeType);

    /// <summary>
    /// Saves the user dictionary. This overwrites the user dictionary files in the data
    /// folder; the configuration file controls whether the dictionary is used next time.
    /// </summary>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    private static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>
    /// Segments a paragraph of text.
    /// </summary>
    /// <param name="sParagraph">Text to process.</param>
    /// <param name="result">Caller-allocated array that receives the segmentation results.</param>
    /// <param name="eCT">Encoding of the text.</param>
    /// <param name="bPOSTagged">Whether part-of-speech tagging is performed.</param>
    /// <returns>Used by the wrapper as the number of result entries written.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    private static extern int ICTCLAS_ParagraphProcessAW(string sParagraph, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result, ECodeType eCT, bool bPOSTagged = false);

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="sSrcFilename">File to segment.</param>
    /// <param name="sDsnFilename">Destination file.</param>
    /// <param name="eCt">Encoding.</param>
    /// <param name="bPOStagged">Whether part-of-speech tagging is performed.</param>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    private static extern bool ICTCLAS_FileProcess(string sSrcFilename, string sDsnFilename, ECodeType eCt, bool bPOStagged = false);
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
namespace ICTCLAS.NET
{
//////////////////////////////////////////////////////////////////////// //
// character coding types
//////////////////////////////////////////////////////////////////////// //
/// <summary>
/// Character encodings that can be passed to the native segmenter.
/// The numeric values are the implicit 0..4 sequence of the original declaration,
/// written out explicitly here.
/// </summary>
public enum ECodeType
{
    /// <summary>Encoding not known.</summary>
    CODE_TYPE_UNKNOWN = 0,

    /// <summary>ASCII.</summary>
    CODE_TYPE_ASCII = 1,

    /// <summary>GB family: GB2312, GBK, GB18030.</summary>
    CODE_TYPE_GB = 2,

    /// <summary>UTF-8.</summary>
    CODE_TYPE_UTF8 = 3,

    /// <summary>BIG5.</summary>
    CODE_TYPE_BIG5 = 4
}
/// <summary>
/// Part-of-speech tag set selectors passed to <c>ICTCLAS_SetPOSmap</c>,
/// plus two size constants declared alongside them.
/// </summary>
public enum EPosTag
{
/// <summary>
/// ICT (Institute of Computing Technology) second-level tag set
/// </summary>
ICT_POS_MAP_SECOND = 0,
/// <summary>
/// ICT (Institute of Computing Technology) first-level tag set
/// </summary>
ICT_POS_MAP_FIRST = 1,
/// <summary>
/// PKU (Peking University) second-level tag set
/// </summary>
PKU_POS_MAP_SECOND = 2,
/// <summary>
/// PKU (Peking University) first-level tag set
/// </summary>
PKU_POS_MAP_FIRST = 3,
/// <summary>
/// Number of tag sets
/// </summary>
POS_MAP_NUMBER = 4,
/// <summary>
/// Maximum size, in bytes, of a part-of-speech tag
/// </summary>
POS_SIZE = 8
}
/// <summary>
/// Native segmentation result record filled in by <c>ICTCLAS_ParagraphProcessAW</c>.
/// Eight consecutive 32-bit integers (offsets 0, 4, ..., 28; 32 bytes in total),
/// expressed as a sequential layout instead of explicit per-field offsets.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 4)]
struct result_t
{
    /// <summary>Start position of the word; used by the wrapper as a byte offset into the encoded input.</summary>
    public int start;

    /// <summary>Length of the word; used by the wrapper as a byte count.</summary>
    public int length;

    // NOTE(review): sPos and sPosLow together span 8 bytes, which matches EPosTag.POS_SIZE;
    // presumably they overlay the native POS tag text - confirm against the native header.
    public int sPos;
    public int sPosLow;

    /// <summary>Part-of-speech id.</summary>
    public int POS_id;

    /// <summary>Word id.</summary>
    public int word_ID;

    /// <summary>Word type.</summary>
    public int word_type;

    /// <summary>Weight of the word.</summary>
    public int weight;
}
/// <summary>
/// One segmented word as returned by <c>WordSegger.SegStr</c>.
/// </summary>
public struct Word
{
/// <summary> Text of the word. </summary>
public string str;
/// <summary> Part-of-speech id, copied from the native result's POS_id. </summary>
public int pos_id;
/// <summary> Word id, copied from the native result's word_ID. </summary>
public int word_id;
/// <summary> Weight, copied from the native result's weight. </summary>
public int weight;
/// <summary> Word type, copied from the native result's word_type. </summary>
public int word_type;
}
/// <summary>
/// Managed wrapper around the native ICTCLAS50 word-segmentation library,
/// exposed as a singleton obtained through <see cref="GetInstance"/>.
/// </summary>
public class WordSegger
{
    /// <summary>
    /// Holds the singleton instance. The explicit (empty) static constructor keeps the
    /// compiler from marking this type beforefieldinit, so the instance is created on
    /// first access to <see cref="instance"/> rather than at an unspecified earlier time.
    /// </summary>
    class Nested
    {
        static Nested()
        {
        }

        internal static readonly WordSegger instance = new WordSegger();
    }

    // Serializes ICTCLAS_Init / ICTCLAS_Exit calls.
    private static readonly object lockobj = new object();

    // volatile because it is also read outside the lock in GetInstance and Release.
    private static volatile bool inited = false;

    /// <summary>
    /// Returns the segmenter, initializing the native library on the first call.
    /// The first call must supply <paramref name="path"/>; later calls may omit it.
    /// </summary>
    /// <param name="path">Directory containing the configuration file and the data files.</param>
    /// <returns>The singleton instance, or <c>null</c> when native initialization fails.</returns>
    public static WordSegger GetInstance(string path = "")
    {
        if (inited)
        {
            return Nested.instance;
        }

        lock (lockobj)
        {
            if (!inited)
            {
                inited = ICTCLAS_Init(path);
                if (!inited)
                {
                    return null;
                }
            }

            return Nested.instance;
        }
    }

    /// <summary>
    /// Releases the native resources held by the segmenter by calling <c>ICTCLAS_Exit</c>.
    /// Does nothing when the library is not initialized. A later
    /// <see cref="GetInstance"/> call initializes the library again.
    /// </summary>
    public static void Release()
    {
        if (!inited)
        {
            return;
        }

        lock (lockobj)
        {
            if (inited)
            {
                ICTCLAS_Exit();
                inited = false;
            }
        }
    }

    /// <summary>
    /// Private constructor: instances are only created through <see cref="GetInstance"/>.
    /// </summary>
    private WordSegger()
    {
    }

    /// <summary>
    /// Finalizer; calls <see cref="Release"/> so the native library is shut down
    /// if the caller never released it explicitly.
    /// </summary>
    ~WordSegger()
    {
        Release();
    }

    /// <summary>
    /// Segments a string into words.
    /// </summary>
    /// <param name="str">Text to segment.</param>
    /// <param name="ecode">Encoding hint passed to the native library.</param>
    /// <param name="posTagged">Whether part-of-speech tagging is performed.</param>
    /// <returns>The segmented words; an empty array for empty input or when the native call reports no words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is <c>null</c>.</exception>
    public Word[] SegStr(string str, ECodeType ecode, bool posTagged = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (str.Length == 0)
        {
            return new Word[0];
        }

        // One result slot per input character, as in the original sizing.
        result_t[] result = new result_t[str.Length];
        int cnt = ICTCLAS_ParagraphProcessAW(str, result, ecode, posTagged);
        if (cnt <= 0)
        {
            // A non-positive count would otherwise make the array allocation below throw.
            return new Word[0];
        }

        // start/length are applied as byte offsets into the text encoded with Encoding.Default.
        // NOTE(review): this assumes Encoding.Default is the same code page the CharSet.Ansi
        // marshaler used for the native call - confirm on the target runtime.
        byte[] mybyte = Encoding.Default.GetBytes(str);
        Word[] words = new Word[cnt];
        for (int i = 0; i < cnt; i++)
        {
            words[i].str = Encoding.Default.GetString(mybyte, result[i].start, result[i].length);
            words[i].pos_id = result[i].POS_id;
            words[i].word_id = result[i].word_ID;
            words[i].weight = result[i].weight;
            words[i].word_type = result[i].word_type;
        }

        return words;
    }

    /// <summary>
    /// Segments a text file and writes the result to another file.
    /// </summary>
    /// <param name="src">Path of the source file.</param>
    /// <param name="ct">Encoding of the source file.</param>
    /// <param name="des">Path of the destination file.</param>
    /// <param name="postag">Whether part-of-speech tagging is performed.</param>
    /// <returns>Whether segmentation succeeded.</returns>
    public bool SegFile(string src, ECodeType ct, string des, bool postag = false)
    {
        return ICTCLAS_FileProcess(src, des, ct, postag);
    }

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    /// <returns>Success or failure as reported by the native library.</returns>
    public bool SetPosTagMap(EPosTag nPOSmap)
    {
        return ICTCLAS_SetPOSmap(nPOSmap);
    }

    /// <summary>
    /// Imports a user dictionary from a file.
    /// File format: one word per line; word and part of speech are separated by "@@"
    /// (for example "中科院@@nr"); the part of speech may be omitted.
    /// </summary>
    /// <param name="path">Path of the user dictionary file.</param>
    /// <param name="ct">Encoding of the file.</param>
    /// <returns>Number of user words imported.</returns>
    public uint ImportUserDictFile(string path, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        return ICTCLAS_ImportUserDictFile(path, ct);
    }

    /// <summary>
    /// Imports user words from a string.
    /// </summary>
    /// <param name="userDict">
    /// User words: word and part of speech separated by "@@", entries separated by a
    /// half-width ';', part of speech optional. For example "中科院@@nr;分词 v;系统@@adj;"
    /// or "中科院;分词;系统;".
    /// </param>
    /// <param name="ct">Encoding.</param>
    /// <returns>Number of user words imported.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="userDict"/> is <c>null</c>.</exception>
    public uint ImportUserDict(string userDict, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        if (userDict == null)
        {
            throw new ArgumentNullException("userDict");
        }

        // The native side receives an ANSI byte buffer, so the length must be the encoded
        // byte count; string.Length counts UTF-16 chars and under-counts Chinese text.
        int byteLength = Encoding.Default.GetByteCount(userDict);
        return ICTCLAS_ImportUserDict(userDict, byteLength, ct);
    }

    /// <summary>
    /// Saves the user dictionary.
    /// </summary>
    /// <returns><c>true</c> when the native call returns a non-zero value.</returns>
    public bool SaveUserDict()
    {
        return ICTCLAS_SaveTheUsrDic() != 0;
    }

    // Library and entry-point names must not carry surrounding whitespace: the entry point
    // string is matched against the DLL's export names.
    const string DLLPATH = @"ICTCLAS50.dll";

    // NOTE(review): the bool returns/parameters below use default P/Invoke bool marshaling
    // (4-byte BOOL). If the native functions use 1-byte C++ bool, they would need
    // MarshalAs(UnmanagedType.I1) - confirm against the native header.

    /// <summary>
    /// Initializes the library. Must succeed before any other native function is called.
    /// </summary>
    /// <param name="sInitDirPath">Directory containing the configuration file and data files.</param>
    /// <returns>Whether initialization succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    private static extern bool ICTCLAS_Init(string sInitDirPath);

    /// <summary>
    /// Releases the library's resources; to be called once all operations are finished.
    /// </summary>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    private static extern bool ICTCLAS_Exit();

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SetPOSmap")]
    private static extern bool ICTCLAS_SetPOSmap(EPosTag nPOSmap);

    /// <summary>
    /// Imports a user dictionary file (one word per line, "@@" between word and
    /// part of speech, part of speech optional).
    /// </summary>
    /// <param name="pszFileName">Path of the user dictionary file.</param>
    /// <param name="codeType">Encoding of the dictionary.</param>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDictFile")]
    private static extern uint ICTCLAS_ImportUserDictFile(string pszFileName, ECodeType codeType = ECodeType.CODE_TYPE_UNKNOWN);

    /// <summary>
    /// Builds a user dictionary from the given words; it replaces the user dictionary
    /// currently held in memory.
    /// </summary>
    /// <param name="pszDictBuffer">
    /// User words: "@@" between word and part of speech, half-width ';' between entries,
    /// part of speech optional.
    /// </param>
    /// <param name="length">Length of the buffer.</param>
    /// <param name="codeType">Encoding.</param>
    /// <returns>Number of words imported.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    private static extern uint ICTCLAS_ImportUserDict(string pszDictBuffer, int length, ECodeType codeType);

    /// <summary>
    /// Saves the user dictionary. This overwrites the user dictionary files in the data
    /// folder; the configuration file controls whether the dictionary is used next time.
    /// </summary>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    private static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>
    /// Segments a paragraph of text.
    /// </summary>
    /// <param name="sParagraph">Text to process.</param>
    /// <param name="result">Caller-allocated array that receives the segmentation results.</param>
    /// <param name="eCT">Encoding of the text.</param>
    /// <param name="bPOSTagged">Whether part-of-speech tagging is performed.</param>
    /// <returns>Used by the wrapper as the number of result entries written.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    private static extern int ICTCLAS_ParagraphProcessAW(string sParagraph, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result, ECodeType eCT, bool bPOSTagged = false);

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="sSrcFilename">File to segment.</param>
    /// <param name="sDsnFilename">Destination file.</param>
    /// <param name="eCt">Encoding.</param>
    /// <param name="bPOStagged">Whether part-of-speech tagging is performed.</param>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    private static extern bool ICTCLAS_FileProcess(string sSrcFilename, string sDsnFilename, ECodeType eCt, bool bPOStagged = false);
}
}
整个工程代码:
ICTCLAS.NET.rar
ICTCLAS分词接口建议到官方下载,不过为了对应版本,也可以从如下地址下载:
转载请注明本文地址: ICTCLAS.NET——给C/C++程序写.NET wrapper