一个页面静态化的工具类

package com.spider;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;

/**
 * Page "static-ization" helper: downloads a page's HTML, rewrites the dynamic
 * link targets ({@code .do} / {@code .jsp}) found in anchor tags to their
 * {@code .html} form, and appends a version stamp to referenced resources
 * (stylesheets, scripts, images, CSS {@code url(...)} values).
 *
 * @author lpf
 */
public class Spider {

    /**
     * Regex fragment placed between a tag name and an attribute name. It
     * requires whitespace after the tag name and never crosses the closing
     * {@code >}, so a match stays inside one tag, and the attribute must be
     * preceded by whitespace (so {@code data-href} is not taken for {@code href}).
     */
    private static final String ATTR_PREFIX = "\\s+(?:[^>]*?\\s)?";

    /** Matches the double-quoted {@code href} value (group 1) of an {@code <a>} tag. */
    private static final Pattern ANCHOR_HREF =
            Pattern.compile("<a" + ATTR_PREFIX + "href=\"([^\"]+)\"");

    /** URLs that are never rewritten by {@link #staticUrl(StringBuffer)}. */
    static String refuseUrl[] = new String[] {"/shop/acart.do"};

    /**
     * URL rewrite rules, tried in order; the first rule whose regex (index 0)
     * is found in the URL wins, and its capture group 1 is substituted into
     * the {@link String#format} template (index 1).
     */
    static String[][] convert = new String[][] {
        // starts with "/" and ends with ".do"
        {"^/(.+?)\\.do$", "/%s.html"},
        // starts with "/" and ends with ".jsp"
        {"^/(.+?)\\.jsp$", "/%s.html"},
        // category listing
        {"^/phone/peijian/index\\.jsp\\?type=(.+)", "/phone/peijian/index_type_%s.html"},
        // product page
        {"^/phone/product\\.do\\?id=(.+)", "/phone/product%s.html"}
    };

    /**
     * Patterns whose capture group 1 is a resource URL that receives a version
     * stamp in {@link #addV(StringBuffer)}: link href, script src, img src and
     * CSS {@code url(...)} (optionally quoted).
     */
    static String[] v = new String[] {
        "<link" + ATTR_PREFIX + "href=\"([^\"]+)\"",
        "<script" + ATTR_PREFIX + "src=\"([^\"]+)\"",
        "<img" + ATTR_PREFIX + "src=\"([^\"]+)\"",
        "url\\(\\s*['\"]?([^'\"()]+?)['\"]?\\s*\\)"
    };

    public static void main(String[] args) throws Exception {
        System.out.println(addV(staticUrl(getUrlHtml("http://shnk.familydoctor.com.cn/"))));
    }

    /**
     * Appends a version stamp ({@code v=<current time in millis>}) to every
     * resource URL matched by the patterns in {@link #v}. One timestamp is
     * taken per call, so all resources of a page share the same version.
     *
     * @param sb the HTML to process; {@code null} is treated as empty input
     * @return a new buffer holding the processed HTML (never {@code null})
     */
    public static StringBuffer addV(StringBuffer sb) {
        if (sb == null) {
            return new StringBuffer();
        }
        final long version = System.currentTimeMillis();
        StringBuffer current = sb;
        for (int i = 0; i < v.length; i++) {
            Matcher matcher = Pattern.compile(v[i]).matcher(current);
            StringBuffer rewritten = new StringBuffer();
            while (matcher.find()) {
                String replaced = replaceGroup1(matcher, appendVersion(matcher.group(1), version));
                // Quote so that '$' and '\' in URLs are not read as group references.
                matcher.appendReplacement(rewritten, Matcher.quoteReplacement(replaced));
            }
            matcher.appendTail(rewritten);
            current = rewritten;
        }
        return current;
    }

    /**
     * Collects the {@code href} values of all anchor tags of a page.
     *
     * @param url address of the page to download
     * @return the link targets in document order; empty when the page could
     *         not be downloaded
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static List<String> getPageUrl(String url) throws Exception {
        List<String> links = new ArrayList<String>();
        StringBuffer html = getUrlHtml(url);
        if (html == null) {
            return links;
        }
        Matcher matcher = ANCHOR_HREF.matcher(html);
        while (matcher.find()) {
            links.add(matcher.group(1));
        }
        return links;
    }

    /**
     * Downloads the content behind a URL.
     *
     * @param url address to download
     * @return the downloaded content, or {@code null} when the URL is
     *         malformed or the download fails (the stack trace is printed)
     */
    public static StringBuffer getUrlHtml(String url) {
        InputStream in = null;
        try {
            in = new URL(url).openStream();
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            IOUtils.copy(in, os);
            // NOTE(review): decodes with the platform default charset, as before;
            // confirm the encoding of the target pages and make it explicit.
            return new StringBuffer(new String(os.toByteArray()));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failed copy does not leak the connection.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best effort: the content (if any) has already been read
                }
            }
        }
        return null;
    }

    /**
     * Rewrites the {@code href} of every anchor tag according to the rules in
     * {@link #convert}, leaving URLs listed in {@link #refuseUrl} untouched.
     *
     * @param sb the HTML to process; {@code null} is treated as empty input
     * @return a new buffer holding the processed HTML (never {@code null})
     */
    public static StringBuffer staticUrl(StringBuffer sb) {
        StringBuffer res = new StringBuffer();
        if (sb == null) {
            return res;
        }
        Matcher matcher = ANCHOR_HREF.matcher(sb);
        while (matcher.find()) {
            String replaced = replaceGroup1(matcher, convertUrl(matcher.group(1)));
            // Quote so that '$' and '\' in URLs are not read as group references.
            matcher.appendReplacement(res, Matcher.quoteReplacement(replaced));
        }
        matcher.appendTail(res);
        return res;
    }

    /**
     * Maps one URL to its static form using the first matching rule.
     *
     * @param url the link target to convert
     * @return the converted URL, or {@code url} itself when it is excluded or
     *         no rule applies
     */
    private static String convertUrl(String url) {
        // Pages that must stay dynamic are filtered out first.
        for (int i = 0; i < refuseUrl.length; i++) {
            if (refuseUrl[i].equals(url)) {
                return url;
            }
        }
        // Apply the first rewrite rule that matches.
        for (int i = 0; i < convert.length; i++) {
            String[] rule = convert[i];
            Matcher matcher = Pattern.compile(rule[0]).matcher(url);
            if (matcher.find()) {
                return String.format(rule[1], matcher.group(1));
            }
        }
        return url;
    }

    /**
     * Returns the text of the current match with exactly the range captured
     * by group 1 replaced; other occurrences of the same text inside the
     * match (e.g. in another attribute) are left alone.
     */
    private static String replaceGroup1(Matcher matcher, String replacement) {
        String whole = matcher.group(0);
        int from = matcher.start(1) - matcher.start();
        int to = matcher.end(1) - matcher.start();
        return whole.substring(0, from) + replacement + whole.substring(to);
    }

    /**
     * Adds {@code v=<version>} to a URL as a query parameter, using {@code &}
     * when a query string is already present and keeping any {@code #fragment}
     * at the end. {@code data:} URIs are returned unchanged.
     */
    private static String appendVersion(String url, long version) {
        if (url.startsWith("data:")) {
            return url;
        }
        int hash = url.indexOf('#');
        String base = hash < 0 ? url : url.substring(0, hash);
        String fragment = hash < 0 ? "" : url.substring(hash);
        char separator = base.indexOf('?') < 0 ? '?' : '&';
        return base + separator + "v=" + version + fragment;
    }
}

一个页面静态化的工具类

猜你喜欢