版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/wulinshishen/article/details/52962069
WebMagic是一款基于Java的开源爬虫框架,简单灵活、易于上手,与Scrapy爬虫框架很相似。WebMagic的原理与使用方法在官方文档中有详细的说明,这里就不再赘述,直接上实例。
这个WebMagic爬虫实例是结合Spring框架实现的,采用的是基于Redis的调度器,并对爬虫的过程进行了简单的监听。
WebMagic使用说明链接地址: http://webmagic.io/docs/zh/
GitHub项目链接地址: https://github.com/code4craft/webmagic
<!-- WebMagic dependencies -->
<!-- Core crawler engine -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.5.3</version>
</dependency>
<!-- Extension module -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.5.3</version>
</dependency>
<!-- Selenium integration -->
<!-- NOTE(review): pinned to 0.5.2 while core/extension use 0.5.3 - confirm this mismatch is intentional -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>0.5.2</version>
</dependency>
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Resource;
import javax.management.JMException;
import org.platform.crawler.webmagic.modules.abstr.GenericCrawler;
import org.platform.crawler.webmagic.modules.job.pipeline.JobDBPipeline;
import org.platform.crawler.webmagic.scheduler.RedisScheduler;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.SpiderListener;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.monitor.SpiderMonitor.MonitorSpiderListener;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Spring component that crawls 51job search-result pages with WebMagic.
 *
 * <p>Seed URLs are generated from a single URL template, one per job-area code.
 * The spider is built with a {@link JobPageProcessor}, the injected
 * {@code redisScheduler}, a {@link ConsolePipeline} and the injected
 * {@code jobDBPipeline}, and is registered with {@link SpiderMonitor} so that
 * success/error counters can be printed once {@code run()} returns.
 */
@Component("jobCrawler")
public class JobCrawler extends GenericCrawler {

    /**
     * Search-result URL template. {@code $jobarea$}, {@code $keyword$} and
     * {@code $curr_page$} are placeholders substituted by {@link #buildSearchUrl}.
     *
     * <p>The original per-city URLs contained {@code cotype=99°reefrom=99}: the
     * {@code &deg} prefix of {@code &degreefrom} had been rendered as the degree sign.
     * The parameter separator is restored here.
     */
    private static final String SEARCH_URL_TEMPLATE =
            "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=$jobarea$%2C00"
            + "&district=000000&funtype=0000&industrytype=00&issuedate=9&providesalary=99"
            + "&keyword=$keyword$&keywordtype=2&curr_page=$curr_page$&lang=c&stype=1"
            + "&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99"
            + "&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9";

    /**
     * Values for the {@code jobarea} query parameter, in the original seeding order.
     * The trailing comments repeat the prefixes of the original local variable names.
     */
    private static final String[] JOB_AREA_CODES = {
        "010000", // bj
        "020000", // sh
        "030200", // gz
        "040000", // sz
        "080200", // hz
        "090200", // cd
    };

    /** Search keyword, already URL-encoded (UTF-8 percent-encoding of "工程师"). */
    private static final String KEYWORD = "%E5%B7%A5%E7%A8%8B%E5%B8%88";

    /** Page number used for every seed URL. */
    private static final int FIRST_PAGE = 1;

    @Resource(name = "jobDBPipeline")
    private JobDBPipeline jobDBPipeline = null;

    @Resource(name = "redisScheduler")
    private RedisScheduler redisScheduler = null;

    /**
     * Builds the seed URLs, configures the spider, runs it on the calling thread and
     * then prints the counters collected by the monitor listener.
     */
    public void startCrawl() {
        List<String> seedUrls = new ArrayList<String>();
        for (String areaCode : JOB_AREA_CODES) {
            seedUrls.add(buildSearchUrl(areaCode, KEYWORD, FIRST_PAGE));
        }

        List<Pipeline> pipelines = new ArrayList<Pipeline>();
        pipelines.add(new ConsolePipeline());
        pipelines.add(jobDBPipeline);

        Spider jobSpider = Spider.create(new JobPageProcessor())
                .addUrl(seedUrls.toArray(new String[0]))
                .setScheduler(redisScheduler)
                .setPipelines(pipelines)
                .thread(4);

        try {
            SpiderMonitor.instance().register(jobSpider);
        } catch (JMException e) {
            // Monitoring is best-effort: a failed registration must not prevent the crawl.
            e.printStackTrace();
        }

        jobSpider.run();
        printMonitorSummary(jobSpider);
    }

    /**
     * Fills the placeholders of {@link #SEARCH_URL_TEMPLATE}.
     *
     * @param jobArea value for the {@code jobarea} parameter (without the {@code %2C00} suffix)
     * @param keyword URL-encoded search keyword
     * @param page    page number for the {@code curr_page} parameter
     * @return the complete search URL
     */
    private static String buildSearchUrl(String jobArea, String keyword, int page) {
        return SEARCH_URL_TEMPLATE
                .replace("$jobarea$", jobArea)
                .replace("$keyword$", keyword)
                .replace("$curr_page$", String.valueOf(page));
    }

    /**
     * Prints success count, error count and error URLs of every
     * {@link MonitorSpiderListener} attached to the spider.
     *
     * @param spider the spider whose listeners are inspected
     */
    private static void printMonitorSummary(Spider spider) {
        List<SpiderListener> spiderListeners = spider.getSpiderListeners();
        if (spiderListeners == null) {
            // No listener was attached, so there is nothing to report.
            return;
        }
        for (SpiderListener spiderListener : spiderListeners) {
            if (spiderListener instanceof MonitorSpiderListener) {
                MonitorSpiderListener monitorSpiderListener = (MonitorSpiderListener) spiderListener;
                System.out.println("success count: " + monitorSpiderListener.getSuccessCount());
                System.out.println("error count: " + monitorSpiderListener.getErrorCount());
                System.out.println("error urls: ");
                for (String errorUrl : monitorSpiderListener.getErrorUrls()) {
                    System.out.println(errorUrl);
                }
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>NOTE(review): {@code run()} is not defined in this class; it is presumably
     * inherited from {@link GenericCrawler} and expected to wire the {@code @Resource}
     * fields before calling {@link #startCrawl()} - confirm, since an instance created
     * with {@code new} is not injected by Spring.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        new JobCrawler().run();
    }
}
可以通过setDownloader(new SeleniumDownloader("chromedriver.exe"))模拟浏览器
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.platform.crawler.webmagic.modules.job.entity.Job;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic {@link PageProcessor} for 51job search-result pages.
 *
 * <p>For each page it extracts the result rows into {@link Job} objects, stores them
 * under the result field {@code "jobs"}, and enqueues the next result page by
 * incrementing the {@code curr_page} query parameter of the current URL.
 */
@Component
public class JobPageProcessor implements PageProcessor {

    /** Matches the {@code curr_page=<n>} query parameter; group 1 is the page number. */
    private static final Pattern CURR_PAGE_PATTERN = Pattern.compile("curr_page=(\\d+)");

    /** Download settings: 3 retries, 1000 ms sleep between requests, 10 s timeout. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10 * 1000);

    /**
     * Extracts jobs from the page and schedules the following result page.
     *
     * <p>The next page is only enqueued when the current page yielded at least one job;
     * otherwise every page would enqueue its successor and the crawl would never end.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<String> rows = page.getHtml().xpath("//*[@id='resultList']/div[@class='el']").all();
        List<Job> jobs = new ArrayList<Job>();

        // NOTE(review): the first matched div is skipped, as in the original code -
        // presumably it is a header row; confirm against the actual page markup.
        for (int i = 1, len = rows.size(); i < len; i++) {
            Document row = Jsoup.parse(rows.get(i));
            Elements titleLinks = row.select("p.t1 a");
            Elements companyLinks = row.select("span.t2 a");
            Elements workplaces = row.select("span.t3");
            Elements salaries = row.select("span.t4");
            Elements publishDates = row.select("span.t5");

            for (int e = 0, count = titleLinks.size(); e < count; e++) {
                Job job = new Job();
                job.setJobName(titleLinks.get(e).text());
                job.setJobUrl(titleLinks.get(e).attr("href"));
                // The other columns may be missing in a row; fall back to an empty
                // string instead of failing the whole page with an index error.
                job.setCompanyName(textAt(companyLinks, e));
                job.setWorkplace(textAt(workplaces, e));
                job.setSalary(textAt(salaries, e));
                job.setPublishDate(textAt(publishDates, e));
                jobs.add(job);
            }
        }
        page.putField("jobs", jobs);

        if (jobs.isEmpty()) {
            // Nothing extracted: treat this as the end of the result list and stop paging.
            return;
        }
        String nextUrl = nextPageUrl(page.getRequest().getUrl());
        if (nextUrl != null) {
            page.addTargetRequest(nextUrl);
        }
    }

    /**
     * Returns the text of the element at {@code index}, or an empty string when the
     * selection has fewer elements.
     *
     * @param elements selected elements
     * @param index    zero-based position
     * @return element text or {@code ""}
     */
    private static String textAt(Elements elements, int index) {
        return index < elements.size() ? elements.get(index).text() : "";
    }

    /**
     * Derives the URL of the next result page by incrementing {@code curr_page}.
     *
     * @param currentUrl URL of the page being processed
     * @return the URL with {@code curr_page} increased by one, or {@code null} when the
     *         URL carries no {@code curr_page=<digits>} parameter
     */
    private static String nextPageUrl(String currentUrl) {
        Matcher matcher = CURR_PAGE_PATTERN.matcher(currentUrl);
        if (!matcher.find()) {
            return null;
        }
        int nextPage = Integer.parseInt(matcher.group(1)) + 1;
        return currentUrl.replace(matcher.group(), "curr_page=" + nextPage);
    }

    /**
     * @return the site/download configuration used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
Site中可以添加Header、Cookie、UserAgent、HttpProxy、Domain等信息
import java.util.List;
import javax.annotation.Resource;
import org.platform.crawler.webmagic.modules.abstr.mapper.GenericMapper;
import org.platform.crawler.webmagic.modules.abstr.pipeline.DBPipeline;
import org.platform.crawler.webmagic.modules.job.entity.Job;
import org.platform.crawler.webmagic.modules.job.mapper.JobMapper;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
 * Pipeline that persists the {@link Job} list found under the result field
 * {@code "jobs"} by inserting each entry through the injected {@link JobMapper}.
 */
@Component("jobDBPipeline")
public class JobDBPipeline extends DBPipeline<Job, Long> {

    @Resource(name = "jobMapper")
    private JobMapper jobMapper = null;

    /**
     * @return the mapper used for {@link Job} persistence
     */
    @Override
    public GenericMapper<Job, Long> obtainMapperInstance() {
        return jobMapper;
    }

    /**
     * Inserts every job carried by {@code resultItems}.
     *
     * @param resultItems extraction results of one page; the jobs are read from key {@code "jobs"}
     * @param task        the task that produced the results (unused here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<Job> jobs = resultItems.get("jobs");
        if (jobs == null) {
            // The page produced no "jobs" field; nothing to persist.
            return;
        }
        for (Job job : jobs) {
            jobMapper.insert(job);
        }
    }
}