本文主要用Java selenium实现点击打开漫画
如果遇到 selenium 配置问题，请参阅《从头学习爬虫(十)进阶篇——selenium回顾》。
未使用框架
import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.CompletionService; import java.util.concurrent.ExecutorCompletionService; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import org.apache.http.HttpEntity; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; public class GaReiZeroSpiderX{ public static void main(String[] args) { //主页 String url="https://manhua.dmzj.com/shiling"; //线程数 int threadsize=10; //延迟 long sleeptime=5000; //获取列表页 List<String> itemList=getListPage(url); //获取图片地址 List<String> imgList=getListImg(itemList); //多线程下载 DownLoadImg(imgList,threadsize,sleeptime); } private static List<String> getListImg(List<String> itemList) { List<String> listImg=new ArrayList<>(); if(itemList==null) { return null; } //配置驱动 System.getProperties().setProperty("webdriver.chrome.driver","D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe"); ChromeOptions options = new ChromeOptions(); //配置浏览器位置 options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"); //无头模式 59版本以上才可以 options.addArguments("test-type"); //ignore certificate errors 
options.addArguments("headless");// headless mode options.addArguments("disable-gpu"); //没啥用 本来可以用于页面显示模式设置 options.addArguments("Cookie:display_mode=1"); WebDriver driver = new ChromeDriver(options); for (String url : itemList) { url="https://manhua.dmzj.com"+url; driver.get(url); WebElement webElement = driver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); Html html=new Html(content); String title=html.xpath("//title/text()").toString().split("-")[0]; List<Selectable> s=html.xpath("//div[@class='btmBtnBox']/select/option").nodes(); for (Selectable selectable : s) { //每一话的标题 每一页 图片地址 listImg.add(title+"___"+selectable.xpath("/option/text()")+"___"+"https:"+selectable.xpath("/option/@value")); } } //关闭窗口 driver.close(); //关闭进程 driver.quit(); return listImg; } private static List<String> getListPage(String url) { CloseableHttpResponse response = null; try{ CloseableHttpClient httpClient = HttpClients.createDefault(); RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(1000).setConnectionRequestTimeout(1000).setSocketTimeout(1000).setRedirectsEnabled(true).build(); HttpGet httpGet = new HttpGet(url); httpGet.setConfig(requestConfig); httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"); response =httpClient.execute(httpGet); if (response.getStatusLine().getStatusCode() != 200) { System.out.println("request url failed, http code=" + response.getStatusLine().getStatusCode()); return null; }else{ HttpEntity entity1 = response.getEntity(); String resultStr = EntityUtils.toString(entity1, "utf-8"); Html html=new Html(resultStr); /* List<String> list=new ArrayList<>(); list.add(html.xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").toString());*/ return html.xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all(); } } catch (Exception e) { return null; } finally { if (response != null){ try { 
response.close(); } catch (IOException e) { e.printStackTrace(); } } } } private static void DownLoadImg(List<String> imgList, int threadsize, long sleeptime) { int count=0; int size=imgList.size(); ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadsize); CompletionService<String> cs = new ExecutorCompletionService<String>(fixedThreadPool); for (String url : imgList) { final String url1 = url; cs.submit(new Callable<String>() { public String call() throws Exception { try { Thread.sleep(sleeptime); return down(url1); } catch (InterruptedException e) { System.out.println("线程异常"); return "error_"+"url1"; } } }); } for (String url : imgList) { try { String a = cs.take().get(); if(a!=null) { count++; } } catch (Exception e) { e.printStackTrace(); }finally { if(count==size) { System.out.println("over"); }else { System.out.println(count+"/"+size); } } } fixedThreadPool.shutdown(); } protected static String down(String url) { try { url=url.replace(" ", ""); File dest1 = new File("D:/manhua"); if (!dest1.exists() && !dest1.isDirectory()) { dest1.mkdir(); } File dest2 = new File("D:/manhua/" + url.split("___")[0]); if (!dest2.exists() && !dest2.isDirectory()) { dest2.mkdir(); } File dest = new File("D:/manhua/" + url.split("___")[0] + "/" + url.split("___")[1] + "." 
+ url.split("___")[2].split("\\.")[url.split("___")[2].split("\\.").length- 1]); if (!dest.exists()) { dest.createNewFile(); } //接收字节输入流 InputStream is; //字节输出流 FileOutputStream fos = new FileOutputStream(dest); URL temp; String imgurl=url.split("___")[2]; temp = new URL(imgurl.trim()); HttpURLConnection uc=(HttpURLConnection) temp.openConnection(); uc.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"); //必须加refer 防封 这个比较烂 写成百度地址也可以 uc.addRequestProperty("Referer", "https://manhua.dmzj.com/"); is=uc.getInputStream(); //为字节输入流加缓冲 BufferedInputStream bis = new BufferedInputStream(is); //为字节输出流加缓冲 BufferedOutputStream bos = new BufferedOutputStream(fos); int length; byte[] bytes = new byte[1024 * 20]; while ((length = bis.read(bytes, 0, bytes.length)) != -1) { fos.write(bytes, 0, length); } bos.close(); fos.close(); bis.close(); is.close(); return "success_"+"url1"; } catch (Exception e) { e.printStackTrace(); return "error_"+"url1"; } } }
webmagic框架
spider
import java.util.ArrayList; import java.util.List; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; public class GaReiZeroSpider implements PageProcessor{ static List<String> imgurl=new ArrayList<>(); private Site site =Site.me(); @Override public Site getSite() { return site ; } @Override public void process(Page page) { if(page.getUrl().toString().equals("https://manhua.dmzj.com/shiling")) { List<String> pageUrl=page.getHtml().xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all(); for (String string : pageUrl) { Request request=new Request("https://manhua.dmzj.com"+string); request.addHeader("Cookie", "display_mode=1"); page.addTargetRequest(request); } }else { String title=page.getHtml().xpath("//title/text()").toString().split("-")[0]; List<Selectable> s=page.getHtml().xpath("//div[@class='btmBtnBox']/select/option").nodes(); for (Selectable selectable : s) { imgurl.add(title+"___"+selectable.xpath("/option/text()")+"___"+"https:"+selectable.xpath("/option/@value")); } page.putField("imgurl", imgurl); } } public static void main(String[] args) { Spider.create(new GaReiZeroSpider()).downloader(new GaReiZeroDownloader()).addPipeline(new GaReiZeroPipline()).addUrl("https://manhua.dmzj.com/shiling").start(); } }
downloader
import java.io.Closeable; import java.io.IOException; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; public class GaReiZeroDownloader implements Downloader, Closeable{ @Override public void close() throws IOException { } @Override public Page download(Request request, Task task) { System.getProperties().setProperty("webdriver.chrome.driver","D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe"); ChromeOptions options = new ChromeOptions(); options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"); options.addArguments("test-type"); //ignore certificate errors options.addArguments("headless");// headless mode options.addArguments("disable-gpu"); options.addArguments("Cookie:display_mode=1"); WebDriver driver = new ChromeDriver(options); driver.get(request.getUrl()); WebElement webElement = driver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); driver.close(); driver.quit(); return page; } @Override public void setThread(int threadNum) { } }
pipeline
public class GaReiZeroPipline implements Pipeline{ @Override public void process(ResultItems resultItems, Task task) { try { if(null!=resultItems.get("imgurl")) { List<String> imgurl=resultItems.get("imgurl"); if(!imgurl.isEmpty()) { DownLoadImg(imgurl,5,500); } } } catch (Exception e) { } } private void DownLoadImg(List<String> imgList, int threadsize, long sleeptime) { int count=0; int size=imgList.size(); ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadsize); CompletionService<String> cs = new ExecutorCompletionService<String>(fixedThreadPool); for (String url : imgList) { final String url1 = url; cs.submit(new Callable<String>() { public String call() throws Exception { try { Thread.sleep(sleeptime); return down(url1); } catch (InterruptedException e) { System.out.println("线程异常"); return "error_"+"url1"; } } }); } for (String url : imgList) { try { String a = cs.take().get(); if(a!=null) { count++; } } catch (Exception e) { e.printStackTrace(); }finally { if(count==size) { System.out.println("over"); }else { System.out.println(count+"/"+size); } } } fixedThreadPool.shutdown(); } protected String down(String url) { try { url=url.replace(" ", ""); File dest1 = new File("D:/manhua"); if (!dest1.exists() && !dest1.isDirectory()) { dest1.mkdir(); } File dest2 = new File("D:/manhua/" + url.split("___")[0]); if (!dest2.exists() && !dest2.isDirectory()) { dest2.mkdir(); } File dest = new File("D:/manhua/" + url.split("___")[0] + "/" + url.split("___")[1] + "." 
+ url.split("___")[2].split("\\.")[url.split("___")[2].split("\\.").length- 1]); if (!dest.exists()) { dest.createNewFile(); } //接收字节输入流 InputStream is; //字节输出流 FileOutputStream fos = new FileOutputStream(dest); URL temp; String imgurl=url.split("___")[2]; temp = new URL(imgurl.trim()); HttpURLConnection uc=(HttpURLConnection) temp.openConnection(); uc.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"); //必须加refer 防封 这个比较烂 写成百度地址也可以 uc.addRequestProperty("Referer", "https://manhua.dmzj.com/"); is=uc.getInputStream(); //为字节输入流加缓冲 BufferedInputStream bis = new BufferedInputStream(is); //为字节输出流加缓冲 BufferedOutputStream bos = new BufferedOutputStream(fos); int length; byte[] bytes = new byte[1024 * 20]; while ((length = bis.read(bytes, 0, bytes.length)) != -1) { fos.write(bytes, 0, length); } bos.close(); fos.close(); bis.close(); is.close(); return "success_"+"url1"; } catch (Exception e) { e.printStackTrace(); return "error_"+"url1"; } } }