package text_category; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import com.xjt.nlp.word.ICTCLAS; import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer; import edu.udo.cs.wvtool.main.WVTDocumentInfo; import edu.udo.cs.wvtool.util.TokenEnumeration; public class ChineseTokenizer implements WVTTokenizer, TokenEnumeration { /** The underlying character stream of the currently tokenized document */ private Reader input; /** * The token, which is currently provided. This buffer is neccessary, to implement the semantic of TokenEnumeration */ private String currentToken; public ChineseTokenizer() { input = null; currentToken = null; } /** * @see edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer#tokenize(Reader, WVTDocumentInfo) */ public TokenEnumeration tokenize(Reader source, WVTDocumentInfo d) { if (source != null) { String resultstring = null; try { BufferedReader br = new BufferedReader(source); StringBuffer sb = new StringBuffer(); String inputstring = null; while ((inputstring = br.readLine()) != null) { sb.append(inputstring); } inputstring = sb.toString(); resultstring = inputstring; resultstring = ICTCLASCutWord(inputstring); }catch(Exception e) { return null; } input = new StringReader(resultstring); readNextToken(); return this; } else return null; } /** * Read a token from the character stream and store it into currentToken. If there are no more tokens left store a null value. 
* */ public void readNextToken() { StringBuffer buf = new StringBuffer(); boolean endReached = false; int in = 0; try { // Read from the stream, until a letter occurs in = input.read(); char ch = (char) in; while ((in != -1) && !Character.isLetter(ch)) { in = input.read(); ch = (char) in; } if (in != -1) buf.append(ch); // Read from the stream, util a non-letter occurs while ((in != -1) && Character.isLetter(ch)) { in = input.read(); ch = (char) in; if (Character.isLetter(ch)) buf.append(ch); } } catch (Exception e) { endReached = true; } if (in == -1) endReached = true; if (endReached) { // If the stream ended with a non-empty token, this is the last // token, otherwise there is no more token. if (buf.length() > 0) currentToken = buf.toString(); else currentToken = null; return; } else { // if the end of the stream has not been reached yet, simply store // the extracted token. currentToken = buf.toString(); return; } } /** * @see edu.udo.cs.wvtool.util.TokenEnumeration#hasMoreTokens() */ public boolean hasMoreTokens() { // If the current token does not equal the null value, then there is at // least this token left if (input != null) return (currentToken != null); else return false; } /** * @see edu.udo.cs.wvtool.util.TokenEnumeration#nextToken() */ public String nextToken() { String result = null; // If unequal null, return the current token and read another one from // the stream if (currentToken != null) { result = currentToken; readNextToken(); } else result = null; return result; } public static String ICTCLASCutWord(String inputstring) { String resultstring = null; try { ICTCLAS splitword = ICTCLAS.getInstance(); inputstring = inputstring.replace("\"", ""); inputstring = inputstring.replace("'", ""); inputstring = inputstring.replace("((", ""); inputstring = inputstring.replace("/", ""); inputstring = inputstring.replace(" ", ""); inputstring = inputstring.replace(">", ""); inputstring = inputstring.replace("<", ""); /*Character.UnicodeBlock ub; char[] ch = 
inputstring.toCharArray(); StringBuffer temp = new StringBuffer(); for (int c = 0; c < ch.length; c++) { ub = Character.UnicodeBlock.of(ch[c]); if ((ub == Character.UnicodeBlock.BASIC_LATIN) || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) || Character.isLetter(ch[c])) { temp.append(ch[c]); } } inputstring = temp.toString();*/ //System.out.println(inputstring); inputstring = splitword.paragraphProcess(inputstring); String[] immediatestrings = inputstring.split(" "); StringBuffer sb = new StringBuffer(); for (int i = 0; i < immediatestrings.length; i++) { if (immediatestrings[i].length() <= 0) continue; int end = immediatestrings[i].lastIndexOf("/"); String str = ""; if (end < 0 || end > immediatestrings[i].length()) { str = immediatestrings[i] + " "; } else { str = immediatestrings[i].substring(0, end) + " "; } sb.append(str); } resultstring = sb.toString(); }catch(Exception e) { return null; } return resultstring; } }
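// Residue of the blog page this listing was copied from:
//   "ChineseTokenizer implements WVTTokenizer, TokenEnumeration"
//   Reposted from strayly.iteye.com/blog/2317090
//   (Site navigation headings: "You may also like", "Today's picks", "Weekly ranking".)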