package test; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; public class ContentChecker { public enum CheckRule{ $检查地区, $检查房型, $检查价格, $检查户型, $检查面积, $检查电话; } private static Map<String,String> _$1_rule_map = new HashMap<String,String>(); private static Map<String,String> _$2_rule_map = new HashMap<String,String>(); private static Map<String,String> _$3_rule_map = new HashMap<String,String>(); private static Map<String,String> _$4_rule_map = new HashMap<String,String>(); private static Map<String,String> _$5_rule_map = new HashMap<String,String>(); private static Map<String,String> _$6_rule_map = new HashMap<String,String>(); static{ _$1_rule_map.put("亞凱迪亞", "Arcadia|亚市|亞凱|阿凱迪亞|亞凱迪亞|亞凱迪"); _$1_rule_map.put("阿罕布拉市", "Alhambra|阿罕布拉|阿市|阿罕布拉市"); _$1_rule_map.put("亞凱迪亞","亞凱迪亞|Arcadia|亚市|亞凱|阿凱迪亞|亞凱迪亞|亞凱迪"); _$1_rule_map.put("聖蓋博","聖蓋博|San|Gabriel|聖市"); _$1_rule_map.put("天普市","天普市|Temple|city|天普"); _$1_rule_map.put("蒙羅維亞","蒙羅維亞|Monrovia"); _$1_rule_map.put("蒙特利公園","蒙特利|Monterey|Park|蒙市"); _$1_rule_map.put("阿罕布拉市","阿罕布拉|Alhambra|阿市|阿罕布拉市"); _$1_rule_map.put("蒙地貝婁","蒙地貝婁|Montebello"); _$1_rule_map.put("聖瑪利諾","聖瑪利諾|san|marino|聖市"); _$1_rule_map.put("帕莎迪那","帕莎迪那|Pasadena|帕莎迪娜|帕市"); _$1_rule_map.put("格蘭岱","格蘭岱|Glendale"); _$1_rule_map.put("拉朋地","拉朋地|La Puente"); _$1_rule_map.put("鮑溫公園","鮑溫公園|Baldwin|Park"); _$1_rule_map.put("艾爾蒙地","艾爾蒙地|El|Monte"); _$1_rule_map.put("柔似蜜","柔似蜜|Rosemead|柔市|柔似蜜"); _$1_rule_map.put("奇諾崗","奇諾崗|chino|hills"); _$1_rule_map.put("鑽石吧","鑽石吧|diamond|bar|corona"); _$1_rule_map.put("工業市","工業市|City|of|Industry"); _$1_rule_map.put("核桃市","核桃市|walnut"); _$1_rule_map.put("哈崗","哈崗|Hacienda|Height"); _$1_rule_map.put("羅蘭崗","羅蘭崗|Rowland|Heights"); _$1_rule_map.put("西柯汶納","西柯汶納|west|covina"); _$1_rule_map.put("波莫那","波莫那|Pomona"); _$1_rule_map.put("聖迪瑪斯","聖迪瑪斯|San|Dimas"); _$1_rule_map.put("克萊蒙","克萊蒙|Claremont"); _$1_rule_map.put("厄浦蘭","厄浦蘭|upland"); _$1_rule_map.put("赌城地产","赌城地产|Las|Vages"); _$2_rule_map.put("康斗","康斗|Condo"); _$2_rule_map.put("屋","独立屋|豪宅|House"); _$2_rule_map.put("办公","办公室|Office"); _$2_rule_map.put("仓库","仓库|Warehouse"); _$2_rule_map.put("公寓","公寓|亚寓|Apartment"); _$2_rule_map.put("移动","移动房屋|Mobile|Home"); _$2_rule_map.put("旅馆","汽车旅馆|旅馆|Motel"); _$2_rule_map.put("旺铺","旺铺|商铺|店|Store"); _$2_rule_map.put("洗车","洗车行|Car|Wash"); _$2_rule_map.put("加油","加油站|Gas|Station"); _$2_rule_map.put("PUD","PUD|PUD"); _$2_rule_map.put("土地","土地|Land"); _$2_rule_map.put("其他","其他|Other"); _$3_rule_map.put("价格:%s", "((售\\s*价|售|月\\s*租\\s*金|租\\s*金|月\\s*租|租|\\$)?\\s*\\d+\\s*(每\\s*月|月|\\/月|元|块)+)|((售\\s*价|售|月\\s*租\\s*金|租\\s*金|月\\s*租|租|\\$)+\\s*\\d+\\s*(每\\s*月|月|\\/月|元|块)?)"); _$4_rule_map.put("户型:%s", "(\\d+\\s*(室|厅|厨|卫))+"); _$5_rule_map.put("面积:%s", "((地大|佔地|占地|近)?\\s*\\d+\\s*(余尺|尺|呎)+\\d*)"); _$6_rule_map.put("电话:%s", "(\\d+-\\d+-\\d+)"); } public static String getPropValueFromContent(CheckRule proprule, String content){ String result = null; switch (proprule) { case $检查地区: result = matchInThis(_$1_rule_map,content); break; case $检查房型: result = findFirstInThis(_$2_rule_map,content); break; case $检查价格: result = findFirstInThis(_$3_rule_map,content); break; case $检查户型: result = findFirstInThis(_$4_rule_map,content); break; case $检查面积: result = findFirstInThis(_$5_rule_map,content); break; case $检查电话: result = findFirstInThis(_$6_rule_map,content); break; default: break; } return result; } public static void main(String[] args) { String testContent = "### Vages 阿市 sadfsadaew 收到罚单 Condo 罚单撒旦法撒发是 速度 #### \r\n" + "#### 租八万八#### \r\n" + "#### 三尺三#地大八萬#地大八万三千#近5千尺## \r\n" + "#### 租八百#### \r\n" + "#### 两室一厅一厨一卫月租两百块#### \r\n" + "#### 售 十亿壹 仟贰 佰伍 拾叁 万陆 仟 柒 佰 捌 拾 玖 元 整 #### \r\n" + "#### 售 价 三千五百万### \r\n" + "#### 电话 888-222-11111### \r\n" + "#### 壹仟零贰元整#### \r\n" + "#### $三十四万零二百每月#### \r\n" + "#### $5010万/月##### \r\n"; String newContent = ContentConvertUtil.ReplaceCNNumToInt(testContent); System.out.println(newContent); String find = getPropValueFromContent(CheckRule.$检查地区, newContent); System.out.println(String.format("$检查地区:[%s]", find)); find = getPropValueFromContent(CheckRule.$检查房型, newContent); System.out.println(String.format("$检查房型:[%s]", find)); find = getPropValueFromContent(CheckRule.$检查价格, newContent); System.out.println(String.format("$检查价格:[%s]", find)); find = getPropValueFromContent(CheckRule.$检查户型, newContent); System.out.println(String.format("$检查户型:[%s]", find)); find = getPropValueFromContent(CheckRule.$检查面积, newContent); System.out.println(String.format("$检查面积:[%s]", find)); find = getPropValueFromContent(CheckRule.$检查电话, newContent); System.out.println(String.format("$检查电话:[%s]", find)); } private static String findFirstInThis(Map<String, String> ruleMap, String content) { String result = null; Set<Entry<String, String>> entrySet = ruleMap.entrySet(); for (Entry<String, String> kvp : entrySet) { String regex = String.format("%s", kvp.getValue()); Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(content); String allStr = ""; while(matcher.find()){ allStr = allStr+"|"+matcher.group(); } if(!"".equals(allStr)){ result = String.format(kvp.getKey(),allStr); break; } // if(matcher.find()){ // result = String.format(kvp.getKey(),matcher.group()); // break; // } } return result; } private static String matchInThis(Map<String, String> ruleMap, String content) { String result = null; Set<Entry<String, String>> entrySet = ruleMap.entrySet(); for (Entry<String, String> kvp : entrySet) { String regex = String.format("(\\s|\\S)*(%s)+(\\s|\\S)*", kvp.getValue()); //System.out.println(String.format(">>>>>>>[%s]【%s】", regex,content)); Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(content); if(matcher.matches()){ result = kvp.getKey(); break; } } return result; } } class ContentConvertUtil{ /** * 全文替换 中文混合数字 为 纯数字,适用月正整数,最大支持 999999999 * eg:十亿壹 仟贰 佰伍 拾叁 万陆 仟 柒 佰 捌 拾 玖 元 整 * @param chinaInt * @return */ public static String ReplaceCNNumToInt(String chinaContent) { String regex = CNNumRegex+"+"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(chinaContent); StringBuffer sb = new StringBuffer(); while(matcher.find()){ Integer cnNumToInt = CNNumToInt(matcher.group()); String replaceStr = "*"; if(!cnNumToInt.equals(0)){ replaceStr = cnNumToInt.toString(); } //System.out.println("replace:["+matcher.group()+":"+replaceStr+"]"); matcher.appendReplacement(sb,replaceStr); } matcher.appendTail(sb); return sb.toString(); } /** * 中文混合数字 转 纯数字,适用月正整数,最大支持 999999999 * @param chinaInt * @return */ public static Integer CNNumToInt(String chinaInt) { String regex = CNNumRegex; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(chinaInt); List<Integer> resultSplit = new ArrayList<Integer>(); Integer currVal = 0; Integer prevUnit = 1; while(matcher.find()){ if(matcher.groupCount()==6){ //System.out.println("############matcher:["+matcher.group(2)+"]["+matcher.group(5)+"]"); String numValStr = matcher.group(2); String numUnitStr = matcher.group(5).replaceAll("\\s", ""); Integer numVal = 0; Integer numUnit = 1; if(isNum(numValStr)){ numVal = Integer.parseInt(numValStr); }else{ numVal = numMap.get(numValStr); } numVal = numVal==null?0:numVal; if("".equals(numUnitStr)){ numUnit = prevUnit>1?prevUnit/10:1; }else{ numUnit = unitMap.get(numUnitStr); } numUnit = numUnit==null?1:numUnit; prevUnit = numUnit; if(numUnit>=100000000){ currVal = currVal*100000000 + (numVal*numUnit); //System.out.println(">>>>>currVal["+currVal+"]"); resultSplit.add(currVal); currVal = 0; }else if(numUnit>=10000&&numUnit<100000000){ currVal = currVal*10000 + (numVal*numUnit); //System.out.println(">>>>>currVal["+currVal+"]"); resultSplit.add(currVal); currVal = 0; }else if(1>=1&&numUnit<10000){ currVal = currVal*1 + (numVal*numUnit); //System.out.println(">>>>>currVal["+currVal+"]"); } } } resultSplit.add(currVal); Integer result = 0; for (Integer val : resultSplit) { //System.out.println("######resultSplit["+val+"]"); result = result+val; } //System.out.println("result["+result+"]"); return result; } private static boolean isNum(String target){ target = target.replaceAll("\\s", ""); if(Pattern.compile("\\d+").matcher(target).matches()){ return true; } return false; } private static String CNNumRegex = "(((零|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|〇|一|二|三|四|五|六|七|八|九|十|两)|(\\d+))\\s*((十|拾|百|佰|千|仟|万|萬|亿|\\s)*))"; private static Map<String,Integer> numMap = new HashMap<String,Integer>(); private static Map<String,Integer> unitMap = new HashMap<String,Integer>(); static{ numMap.put("零", 0); numMap.put("壹", 1); numMap.put("贰", 2); numMap.put("叁", 3); numMap.put("肆", 4); numMap.put("伍", 5); numMap.put("陆", 6); numMap.put("柒", 7); numMap.put("捌", 8); numMap.put("玖", 9); numMap.put("拾",10); numMap.put("〇", 0); numMap.put("一", 1); numMap.put("二", 2); numMap.put("三", 3); numMap.put("四", 4); numMap.put("五", 5); numMap.put("六", 6); numMap.put("七", 7); numMap.put("八", 8); numMap.put("九", 9); numMap.put("十", 10); numMap.put("两", 2); } static{ unitMap.put("十", 10); unitMap.put("百", 100); unitMap.put("千", 1000); unitMap.put("万", 10000); unitMap.put("十万", 100000); unitMap.put("百万", 1000000); unitMap.put("千万", 10000000); unitMap.put("拾万", 100000); unitMap.put("佰万", 1000000); unitMap.put("仟万", 10000000); unitMap.put("拾", 10); unitMap.put("佰", 100); unitMap.put("仟", 1000); unitMap.put("萬", 10000); unitMap.put("拾萬", 100000); unitMap.put("佰萬", 1000000); unitMap.put("仟萬", 10000000); unitMap.put("十萬", 100000); unitMap.put("百萬", 1000000); unitMap.put("千萬", 10000000); unitMap.put("亿", 100000000); unitMap.put("拾亿", 1000000000); unitMap.put("十亿", 1000000000); unitMap.put(null, 1); } }
内容抓取匹配例子-中文数字转数字
猜你喜欢
转载自ian-jiang.iteye.com/blog/2250317
今日推荐
周排行