//代码来自HanLP自然语言处理库,git地址:https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/utility/TextUtility.java
/** * 文本工具类 */ public class TextUtility { /** * 单字节 */ public static final int CT_SINGLE = 5;// SINGLE byte /** * 分隔符"!,.?()[]{}+= */ public static final int CT_DELIMITER = CT_SINGLE + 1;// delimiter /** * 中文字符 */ public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese Char /** * 字母 */ public static final int CT_LETTER = CT_SINGLE + 3;// HanYu Pinyin /** * 数字 */ public static final int CT_NUM = CT_SINGLE + 4;// HanYu Pinyin /** * 序号 */ public static final int CT_INDEX = CT_SINGLE + 5;// HanYu Pinyin /** * 中文数字 */ public static final int CT_CNUM = CT_SINGLE + 6; /** * 其他 */ public static final int CT_OTHER = CT_SINGLE + 12;// Other public static int charType(char c) { return charType(String.valueOf(c)); } /** * 判断字符类型 * @param str * @return */ public static int charType(String str) { if (str != null && str.length() > 0) { if ("零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".contains(str)) return CT_CNUM; byte[] b; try { b = str.getBytes("GBK"); } catch (UnsupportedEncodingException e) { b = str.getBytes(); e.printStackTrace(); } byte b1 = b[0]; byte b2 = b.length > 1 ? b[1] : 0; int ub1 = getUnsigned(b1); int ub2 = getUnsigned(b2); if (ub1 < 128) { if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS if (' ' == b1) return CT_OTHER; if ('\n' == b1) return CT_DELIMITER; if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1) return CT_DELIMITER; if ("0123456789".indexOf((char)b1) != -1) return CT_NUM; return CT_SINGLE; } else if (ub1 == 162) return CT_INDEX; else if (ub1 == 163 && ub2 > 175 && ub2 < 186) return CT_NUM; else if (ub1 == 163 && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225 && ub2 <= 250)) return CT_LETTER; else if (ub1 == 161 || ub1 == 163) return CT_DELIMITER; else if (ub1 >= 176 && ub1 <= 247) return CT_CHINESE; } return CT_OTHER; } /** * 是否全是中文 * @param str * @return */ public static boolean isAllChinese(String str) { return str.matches("[\\u4E00-\\u9FA5]+"); } /** * 是否全部不是中文 * @param sString * @return */ public static boolean isAllNonChinese(byte[] sString) { int nLen = sString.length; int i = 0; while (i < nLen) { if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175) return false; if (sString[i] < 0) i += 2; else i += 1; } return true; } /** * 是否全是单字节 * @param str * @return */ public static boolean isAllSingleByte(String str) { assert str != null; for (int i = 0; i < str.length(); i++) { if (str.charAt(i) >128) { return false; } } return true; } /** * 把表示数字含义的字符串转成整形 * * @param str 要转换的字符串 * @return 如果是有意义的整数,则返回此整数值。否则,返回-1。 */ public static int cint(String str) { if (str != null) try { int i = new Integer(str).intValue(); return i; } catch (NumberFormatException e) { } return -1; } /** * 是否全是数字 * @param str * @return */ public static boolean isAllNum(String str) { if (str == null) return false; int i = 0; /** 判断开头是否是+-之类的符号 */ if ("±+-+-—".indexOf(str.charAt(0)) != -1) i++; /** 如果是全角的0123456789 字符* */ while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; // Get middle delimiter such as . if (i > 0 && i < str.length()) { char ch = str.charAt(i); if ("·∶:,,..//".indexOf(ch) != -1) {// 98.1% i++; while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; } } if (i >= str.length()) return true; /** 如果是半角的0123456789字符* */ while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; // Get middle delimiter such as . if (i > 0 && i < str.length()) { char ch = str.charAt(i); if (',' == ch || '.' == ch || '/' == ch || ':' == ch || "∶·,./".indexOf(ch) != -1) {// 98.1% i++; while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) i++; } } if (i < str.length()) { if ("百千万亿佰仟%%‰".indexOf(str.charAt(i)) != -1) i++; } if (i >= str.length()) return true; return false; } /** * 是否全是序号 * @param sString * @return */ public static boolean isAllIndex(byte[] sString) { int nLen = sString.length; int i = 0; while (i < nLen - 1 && getUnsigned(sString[i]) == 162) { i += 2; } if (i >= nLen) return true; while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1) || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1)) {// single // byte // number // char i += 1; } if (i < nLen) return false; return true; } /** * 是否全为英文 * * @param text * @return */ public static boolean isAllLetter(String text) { for (int i = 0; i < text.length(); ++i) { char c = text.charAt(i); if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')))) { return false; } } return true; } /** * 是否全为英文或字母 * * @param text * @return */ public static boolean isAllLetterOrNum(String text) { for (int i = 0; i < text.length(); ++i) { char c = text.charAt(i); if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')) && ((c < '0' || c > '9')))) { return false; } } return true; } /** * 是否全是分隔符 * @param sString * @return */ public static boolean isAllDelimiter(byte[] sString) { int nLen = sString.length; int i = 0; while (i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163)) { i += 2; } if (i < nLen) return false; return true; } /** * 是否全是中国数字 * @param word * @return */ public static boolean isAllChineseNum(String word) {// 百分之五点六的人早上八点十八分起床 String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";// String prefix = "几数上第"; String surfix = "几多余来成倍"; boolean round = false; if (word == null) return false; char[] temp = word.toCharArray(); for (int i = 0; i < temp.length; i++) { if (word.startsWith("分之", i))// 百分之五 { i += 1; continue; } char tchar = temp[i]; if (i == 0 && prefix.indexOf(tchar) != -1) { round = true; } else if (i == temp.length-1 && !round && surfix.indexOf(tchar) != -1) { round = true; } else if (chineseNum.indexOf(tchar) == -1) return false; } return true; } /** * 得到字符集的字符在字符串中出现的次数 * * @param charSet * @param word * @return */ public static int getCharCount(String charSet, String word) { int nCount = 0; if (word != null) { String temp = word + " "; for (int i = 0; i < word.length(); i++) { String s = temp.substring(i, i + 1); if (charSet.indexOf(s) != -1) nCount++; } } return nCount; } /** * 获取字节对应的无符号整型数 * * @param b * @return */ public static int getUnsigned(byte b) { if (b > 0) return (int) b; else return (b & 0x7F + 128); } /** * 判断字符串是否是年份 * * @param snum * @return */ public static boolean isYearTime(String snum) { if (snum != null) { int len = snum.length(); String first = snum.substring(0, 1); // 1992年, 98年,06年 if (isAllSingleByte(snum) && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0))) return true; if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1)) return true; if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2) return true; if (len == 4 && getCharCount("千仟零○", snum) == 2)// 二仟零二年 return true; if (len == 1 && getCharCount("千仟", snum) == 1) return true; if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1 && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1) return true; } return false; } /** * 判断一个字符串的所有字符是否在另一个字符串集合中 * * @param aggr 字符串集合 * @param str 需要判断的字符串 * @return */ public static boolean isInAggregate(String aggr, String str) { if (aggr != null && str != null) { str += "1"; for (int i = 0; i < str.length(); i++) { String s = str.substring(i, i + 1); if (aggr.indexOf(s) == -1) return false; } return true; } return false; } /** * 判断该字符串是否是半角字符 * * @param str * @return */ public static boolean isDBCCase(String str) { if (str != null) { str += " "; for (int i = 0; i < str.length(); i++) { String s = str.substring(i, i + 1); int length = 0; try { length = s.getBytes("GBK").length; } catch (UnsupportedEncodingException e) { e.printStackTrace(); length = s.getBytes().length; } if (length != 1) return false; } return true; } return false; } /** * 判断该字符串是否是全角字符 * * @param str * @return */ public static boolean isSBCCase(String str) { if (str != null) { str += " "; for (int i = 0; i < str.length(); i++) { String s = str.substring(i, i + 1); int length = 0; try { length = s.getBytes("GBK").length; } catch (UnsupportedEncodingException e) { e.printStackTrace(); length = s.getBytes().length; } if (length != 2) return false; } return true; } return false; } /** * 判断是否是一个连字符(分隔符) * * @param str * @return */ public static boolean isDelimiter(String str) { if (str != null && ("-".equals(str) || "-".equals(str))) return true; else return false; } public static boolean isUnknownWord(String word) { if (word != null && word.indexOf("未##") == 0) return true; else return false; } /** * 防止频率为0发生除零错误 * * @param frequency * @return */ public static double nonZero(double frequency) { if (frequency == 0) return 1e-3; return frequency; } /** * 转换long型为char数组 * * @param x */ public static char[] long2char(long x) { char[] c = new char[4]; c[0] = (char) (x >> 48); c[1] = (char) (x >> 32); c[2] = (char) (x >> 16); c[3] = (char) (x); return c; } /** * 转换long类型为string * * @param x * @return */ public static String long2String(long x) { char[] cArray = long2char(x); StringBuilder sbResult = new StringBuilder(cArray.length); for (char c : cArray) { sbResult.append(c); } return sbResult.toString(); } /** * 将异常转为字符串 * * @param e * @return */ public static String exceptionToString(Exception e) { StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); e.printStackTrace(pw); return sw.toString(); } /** * 判断某个字符是否为汉字 * * @param c 需要判断的字符 * @return 是汉字返回true,否则返回false */ public static boolean isChinese(char c) { String regex = "[\\u4e00-\\u9fa5]"; return String.valueOf(c).matches(regex); } /** * 统计 keyword 在 srcText 中的出现次数 * * @param keyword * @param srcText * @return */ public static int count(String keyword, String srcText) { int count = 0; int leng = srcText.length(); int j = 0; for (int i = 0; i < leng; i++) { if (srcText.charAt(i) == keyword.charAt(j)) { j++; if (j == keyword.length()) { count++; j = 0; } } else { i = i - j;// should rollback when not match j = 0; } } return count; } /** * 简单好用的写String方式 * * @param s * @param out * @throws IOException */ public static void writeString(String s, DataOutputStream out) throws IOException { out.writeInt(s.length()); for (char c : s.toCharArray()) { out.writeChar(c); } } /** * 判断字符串是否为空(null和空格) * * @param cs * @return */ public static boolean isBlank(CharSequence cs) { int strLen; if (cs == null || (strLen = cs.length()) == 0) { return true; } for (int i = 0; i < strLen; i++) { if (!Character.isWhitespace(cs.charAt(i))) { return false; } } return true; }