Reprinted from: A Simple Java Crawler Demo with Two Fetching Methods (HTTP / Socket), Part 1
I've recently been looking for a small Java project to write for fun. I couldn't find a suitable one, so I started learning a bit about crawlers, which I find quite interesting. Following a tutorial I came across, this post implements page fetching in two ways: over a raw Socket and over HTTP.
Project structure diagram (image from the original post):
(1) The SystemControl class, which schedules the whole crawl and runs each fetch task.
package com.simple.control;

import com.simple.Level.TaskLevel;
import com.simple.manger.CrawlerManger;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.util.ArrayList;

/**
 * Drives the whole crawl: builds the task list and runs each fetch.
 * Created by lewis on 2016/10/15.
 */
public class SystemControl {
    public static void main(String[] args) {
        ArrayList<UrlPojo> urlPojoArrayList = new ArrayList<>();
        urlPojoArrayList.add(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH));
        urlPojoArrayList.add(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH));
        int count = 0;
        for (UrlPojo urlPojo : urlPojoArrayList) {
            // false selects the HttpURLConnection implementation; true selects the Socket one
            CrawlerManger crawlerManger = new CrawlerManger(false);
            CrawlResultPojo crawlResultPojo = crawlerManger.crawl(urlPojo);
            System.out.println(crawlResultPojo.getPageContent());
            count++;
            System.out.println("Pages crawled so far: " + count);
        }
    }
}
(2) The ICrawl interface provides a common contract for the two fetching methods; both implementations implement it.
package com.simple.Icrawl;

import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

/**
 * Common interface for the crawler implementations.
 * Created by lewis on 2016/10/15.
 */
public interface ICrawl {
    CrawlResultPojo crawl(UrlPojo urlpojo);
}
(3) Assign a priority level to each task.
package com.simple.Level;

/**
 * Priority level of a crawl task.
 * Created by lewis on 2016/10/15.
 */
public enum TaskLevel {
    HIGH, MIDDLE, LOW
}
(4) The crawler's task class and result class.
1). The task class the crawler works with; it holds the URL to fetch, the task priority, and so on.
package com.simple.pojos;

import com.simple.Level.TaskLevel;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * @author lewis
 * A URL crawl task.
 * Created by lewis on 2016/10/15.
 */
public class UrlPojo {
    private String url;                              // page URL
    private TaskLevel tasklevel = TaskLevel.MIDDLE;  // priority level of the URL

    public UrlPojo(String url) {
        this.url = url;
    }

    public UrlPojo(String url, TaskLevel tasklevel) {
        this(url);
        this.tasklevel = tasklevel;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public TaskLevel getTasklevel() {
        return tasklevel;
    }

    public void setTasklevel(TaskLevel tasklevel) {
        this.tasklevel = tasklevel;
    }

    public String getHost() { // host name of the URL
        try {
            return new URL(this.url).getHost();
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return null;
        }
    }

    public HttpURLConnection getConnection() {
        try {
            URLConnection conn = new URL(this.url).openConnection();
            if (conn instanceof HttpURLConnection) {
                return (HttpURLConnection) conn;
            } else {
                throw new Exception("Failed to open an HTTP connection");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
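Note that every UrlPojo carries a TaskLevel, but nothing in this demo actually orders tasks by it. One simple way to make HIGH tasks come out first is a java.util.PriorityQueue keyed on the enum's ordinal (HIGH is declared first, so it has the smallest ordinal). The sketch below is illustrative and not part of the original project:

import com.simple.Level.TaskLevel;
import com.simple.pojos.UrlPojo;

import java.util.Comparator;
import java.util.PriorityQueue;

public class PriorityDemo {
    public static void main(String[] args) {
        // HIGH is declared first in TaskLevel, so a smaller ordinal means higher priority
        PriorityQueue<UrlPojo> queue = new PriorityQueue<>(
                Comparator.comparingInt((UrlPojo p) -> p.getTasklevel().ordinal()));
        queue.add(new UrlPojo("https://www.taobao.com/", TaskLevel.LOW));
        queue.add(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH));
        // poll() hands back the HIGH task before the LOW one
        System.out.println(queue.poll().getTasklevel()); // prints HIGH
    }
}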
2). The result class; every fetch result is stored in an instance of this class.
package com.simple.pojos;

/**
 * Encapsulates one fetch result.
 * Created by lewis on 2016/10/15.
 */
public class CrawlResultPojo {
    private boolean isSuccess;   // whether the fetch succeeded
    private String pageContent;  // page content
    private int httpStatusCode;  // HTTP status code

    public boolean isSuccess() {
        return isSuccess;
    }

    public void setSuccess(boolean success) {
        isSuccess = success;
    }

    public String getPageContent() {
        return pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    public int getHttpStatusCode() {
        return httpStatusCode;
    }

    public void setHttpStatusCode(int httpStatusCode) {
        this.httpStatusCode = httpStatusCode;
    }
}
(5) The crawl manager, which selects the fetching method and returns the crawl result.
package com.simple.manger;

import com.simple.Icrawl.ICrawl;
import com.simple.crawImpl.CrawlerImpl;
import com.simple.crawImpl.HttpUrlConnectionCrawlerImpl;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

/**
 * @author lewis
 * Crawl manager holding the business logic: picks the fetching method and delegates the crawl.
 * Created by lewis on 2016/10/15.
 */
public class CrawlerManger {
    private ICrawl crawler;

    public CrawlerManger(boolean isSocket) {
        if (isSocket) {
            this.crawler = new CrawlerImpl();
        } else {
            this.crawler = new HttpUrlConnectionCrawlerImpl();
        }
    }

    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        return this.crawler.crawl(urlPojo);
    }
}
(6) The two fetching methods:
1). Socket approach:
package com.simple.crawImpl;

import com.simple.Icrawl.ICrawl;
import com.simple.Level.TaskLevel;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.io.*;
import java.net.Socket;

/**
 * Socket-based fetching.
 * Created by lewis on 2016/10/15.
 */
public class CrawlerImpl implements ICrawl {

    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) { // fetch the URL's content and return the result
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlpojo == null || urlpojo.getUrl() == null) { // guard against a null task or null URL
            crawlResultPojo.setPageContent(null);
            crawlResultPojo.setSuccess(false);
            return crawlResultPojo;
        }
        String host = urlpojo.getHost();
        BufferedWriter bw = null;
        BufferedReader br = null;
        Socket socket = null;
        if (host != null) {
            try {
                /*
                 * The usual socket workflow:
                 * (1) create the Socket;
                 * (2) open the input/output streams attached to it;
                 * (3) read/write over it according to some protocol;
                 * (4) close the Socket.
                 * Here host and port identify the remote end of the connection.
                 */
                socket = new Socket(host, 80);
                bw = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
                /*
                 * HTTP/1.1 uses persistent connections: the server keeps the TCP connection
                 * open after responding, so the readLine() loop below would block forever.
                 * HTTP/1.0 releases the connection after one request/response, which is why
                 * the request line asks for HTTP/1.0.
                 */
                bw.write("GET " + urlpojo.getUrl() + " HTTP/1.0\r\n");
                bw.write("Host: " + host + "\r\n");
                bw.write("\r\n"); // a blank line terminates the HTTP request headers
                bw.flush();       // push the request out to the server
                br = new BufferedReader(new InputStreamReader(socket.getInputStream(), "utf-8"));
                String line;
                StringBuilder stringBuilder = new StringBuilder();
                while ((line = br.readLine()) != null) {
                    stringBuilder.append(line + "\n");
                }
                crawlResultPojo.setSuccess(true);
                crawlResultPojo.setPageContent(stringBuilder.toString());
                return crawlResultPojo;
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try { // close the streams first, then the socket, to release resources
                    if (br != null)
                        br.close();
                    if (bw != null)
                        bw.close();
                    if (socket != null)
                        socket.close();
                } catch (IOException e) {
                    e.printStackTrace();
                    System.out.println("Failed to close the streams");
                }
            }
        }
        return null;
    }

    public static void main(String[] args) {
        CrawlerImpl cl = new CrawlerImpl();
        System.out.println(cl.crawl(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH)).getPageContent());
        System.out.println("done");
    }
}
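One caveat: the demo URL is https, but this implementation speaks plain HTTP on port 80, so taobao.com answers with a redirect rather than the page itself. To fetch an https URL over a raw socket, the TLS handshake can be delegated to javax.net.ssl.SSLSocketFactory on port 443. The following standalone sketch assumes the same HTTP/1.0 request format as above and is not part of the original project:

import javax.net.ssl.SSLSocketFactory;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.Socket;

public class SslSocketDemo {
    public static void main(String[] args) throws Exception {
        String host = "www.taobao.com";
        // the default SSL socket factory performs the TLS handshake for us
        Socket socket = SSLSocketFactory.getDefault().createSocket(host, 443);
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
        bw.write("GET / HTTP/1.0\r\n");
        bw.write("Host: " + host + "\r\n");
        bw.write("\r\n");
        bw.flush();
        BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream(), "utf-8"));
        String line;
        while ((line = br.readLine()) != null) {
            System.out.println(line);
        }
        br.close();
        bw.close();
        socket.close();
    }
}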
2). HTTP (HttpURLConnection) approach:
package com.simple.crawImpl;

import com.simple.Icrawl.ICrawl;
import com.simple.Level.TaskLevel;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;

/**
 * HttpURLConnection-based fetching.
 * Created by lewis on 2016/10/15.
 */
public class HttpUrlConnectionCrawlerImpl implements ICrawl {

    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlpojo == null || urlpojo.getUrl() == null) { // guard against a null task or null URL
            crawlResultPojo.setPageContent(null);
            crawlResultPojo.setSuccess(false);
            return crawlResultPojo;
        }
        HttpURLConnection httpURLConnection = urlpojo.getConnection();
        if (httpURLConnection != null) {
            BufferedReader bufferedReader = null;
            try {
                bufferedReader = new BufferedReader(
                        new InputStreamReader(httpURLConnection.getInputStream(), "utf-8"));
                String line;
                StringBuilder stringBuilder = new StringBuilder();
                while ((line = bufferedReader.readLine()) != null) {
                    stringBuilder.append(line + "\n");
                }
                crawlResultPojo.setPageContent(stringBuilder.toString());
                crawlResultPojo.setSuccess(true);
                return crawlResultPojo;
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (bufferedReader != null)
                        bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    public static void main(String[] args) {
        System.out.println(new HttpUrlConnectionCrawlerImpl()
                .crawl(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH)).getPageContent());
        System.out.println("done");
    }
}
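Neither implementation ever fills in CrawlResultPojo's HTTP status code, and HttpURLConnection applies no timeouts by default, so a stalled server can hang the crawl indefinitely. Both gaps are easy to close with the standard HttpURLConnection API. Here is a minimal standalone sketch (the User-Agent string is just an illustrative value, not from the original project):

import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.io.IOException;
import java.net.HttpURLConnection;

public class TimeoutDemo {
    public static void main(String[] args) throws IOException {
        UrlPojo urlPojo = new UrlPojo("https://www.taobao.com/");
        CrawlResultPojo result = new CrawlResultPojo();
        HttpURLConnection conn = urlPojo.getConnection();
        if (conn == null) return;       // getConnection() returns null on failure
        conn.setConnectTimeout(5000);   // give up if connecting takes more than 5 s
        conn.setReadTimeout(10000);     // give up if the server stalls mid-response
        conn.setRequestProperty("User-Agent", "SimpleCrawler/1.0"); // some sites reject the default agent
        result.setHttpStatusCode(conn.getResponseCode());           // record the status code in the result
        System.out.println("HTTP status: " + result.getHttpStatusCode());
        conn.disconnect();
    }
}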