一个页面静态化的工具类

package com.spider;import java.io.ByteArrayOutputStream;import java.io.IOException;import java.io.InputStream;import java.net.MalformedURLException;import java.net.URL;import java.util.ArrayList;import java.util.Arrays;import java.util.Date;import java.util.List;import java.util.UUID;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.io.IOUtils;/** * @author lpf * 一个URL缓存工具类 */public class Spider { static String refuseUrl[] = new String[] {"/shop/acart.do"};//多有不需要处理的url地址 static String[][] convert = new String[4][2] ;//url变更规则 static String[] v= new String[]{"<link.+href=\"(.+?)\"","<script.+src=\"(.+?)\"","<img.+src=\"(.+?)\"","url\\((.+?)\\)"} ;//给css,js,jpg,png 加版本号 static{ convert[0]=new String[]{ "^/(.+?).do$", "/%s.html"}; //处理以/开头,以.do结尾的 convert[1]=new String[]{ "^/(.+?).jsp$", "/%s.html"};//处理以/开头,以.jsp结尾的 convert[2]=new String[]{ "^/phone/peijian/index.jsp\\?type=(.+)", "/phone/peijian/index_type_%s.html"};//处理分类 convert[3]=new String[]{ "^/phone/product.do\\?id=(.+)", "/phone/product%s.html"};//处理产品 } public static void main(String[] args) throws Exception { System.out.println(addV(  staticUrl(getUrlHtml("http://shnk.familydoctor.com.cn/")) ) ); } /** * 给css,js,jpg,png 加版本号 * @author lpf * @param sb * @return */ public static StringBuffer  addV(StringBuffer sb)   { StringBuffer res=new StringBuffer(); for (int i = 0; i < v.length; i++) { res=new StringBuffer(); Pattern pattern = Pattern.compile( v[i]); Matcher matcher = pattern.matcher( sb);   while (matcher.find()) {   String v=matcher.group(0);   String v1=matcher.group(1);   matcher.appendReplacement(res,v.replace(v1 ,v1+"?v="+  new Date().getTime()));   }   matcher.appendTail(res);   sb=res; } return res; } /** * 得到一个网页的所有url * @author lpf * @param url * @return * @throws Exception */ public static List<String> getPageUrl(String url) throws  Exception {   List<String> ls=new ArrayList<String>();   StringBuffer sb= getUrlHtml(url);   Pattern pattern = Pattern.compile("<a.+href=\"(.+?)\"");   Matcher matcher = pattern.matcher( sb);   while (matcher.find()) {   ls.add( matcher.group(1));   }   return ls; } /** * 得到一个网页的html内容 * @author lpf * @param url * @return */ public static StringBuffer getUrlHtml(String url)  { List<String> ls=new ArrayList<String>(); try {   URL u = new URL(  url);   InputStream in = u.openStream();   ByteArrayOutputStream os=new ByteArrayOutputStream();   IOUtils.copy(in, os);   in.close();   StringBuffer sb=new StringBuffer( new String( os.toByteArray()) );   os.close();   return sb; } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return null; } /** * 把一个网页的内容url地址全部静态化 * @author lpf * @param sb * @return */ public static StringBuffer staticUrl(StringBuffer sb)   {   Pattern pattern = Pattern.compile("<a.+href=\"(.+?)\"");   Matcher matcher = pattern.matcher( sb);   StringBuffer res=new StringBuffer();   while (matcher.find()) {   String v=matcher.group(0);   String v1=matcher.group(1);   matcher.appendReplacement(res,v.replace(v1 ,convertUrl( matcher.group(1))));   }   matcher.appendTail(res);   return res; } /** * 把一个url地址,通过规则变成另外个地址 * @author lpf * @param url * @return */ private static String convertUrl(String url)   { //先过滤不需要处理的页面 for (int i = 0; i < refuseUrl.length; i++) { if  (  refuseUrl[i].equals(url)) { return url; } } //按照规则改写地址 for (int i = 0; i < convert.length; i++) {   String []c=convert[i];   Pattern pattern = Pattern.compile( c[0]);   Matcher matcher = pattern.matcher( url);   while (matcher.find()) {   return String.format( c[1] ,matcher.group(1));   } } return url; }}

猜你喜欢

转载自ldico1231.iteye.com/blog/2197109