1.完善爬虫代码
package com.sj.bd;
import java.io.File;
import java.io.IOException;
import java.util.List;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Writes crawled {@link Job} records into an Excel (.xls) workbook through the
 * JExcel API (jxl).
 */
public class ExcelSolve {

    /** Header captions; column i of every data row holds the matching job field. */
    private static final String[] HEADERS = {
        "城市/区域", "公司名称", "日期", "岗位信息", "薪资", "公司性质", "学历/工作年限", "公司人数"
    };

    /**
     * Creates (or overwrites) the workbook at {@code path} with a single sheet
     * named {@code "中华英才网-" + keyWord}: one header row followed by one row
     * per job, in list order.
     *
     * @param list    jobs to write; {@code null} is treated as an empty list, so
     *                only the header row is written
     * @param path    destination file; missing parent directories are created
     * @param keyWord search keyword, used only as the sheet-name suffix
     * @throws IOException           if the output directory or file cannot be
     *                               created or written
     * @throws RowsExceededException if the sheet's row limit is exceeded
     * @throws WriteException        if jxl rejects a cell or fails to write
     * @throws InterruptedException  never thrown any more; kept in the signature
     *                               so existing callers compile unchanged
     */
    public void writeExcel(List<Job> list, String path, String keyWord)
            throws IOException, RowsExceededException, WriteException, InterruptedException {
        File file = new File(path);

        // createWorkbook fails when the target directory is missing, so create it first.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            throw new IOException("Unable to create output directory: " + parent);
        }

        WritableWorkbook workbook = Workbook.createWorkbook(file);
        try {
            WritableSheet sheet = workbook.createSheet("中华英才网-" + keyWord, 0);

            // Header row.
            for (int col = 0; col < HEADERS.length; col++) {
                sheet.addCell(new Label(col, 0, HEADERS[col]));
            }

            // Data rows start directly below the header.
            int row = 1;
            if (list != null) {
                for (Job job : list) {
                    // Same field order as HEADERS.
                    String[] cells = {
                        job.getCityArea(),
                        job.getCompany(),
                        job.getDate(),
                        job.getJobName(),
                        job.getMoney(),
                        job.getNacture(),
                        job.getRecord(),
                        job.getNum()
                    };
                    for (int col = 0; col < cells.length; col++) {
                        sheet.addCell(new Label(col, row, cells[col]));
                    }
                    row++;
                }
            }

            // No sleep is needed before close(): write() has returned by then.
            workbook.write();
        } finally {
            // Always release the file handle, even when filling the sheet failed.
            workbook.close();
        }
    }
}
package com.sj.bd;
/**
 * Mutable data holder describing one job posting.
 *
 * <p>The fields are public and each one also has a getter/setter pair.
 */
public class Job {

    /** Position information. */
    public String jobName;
    /** Publication date. */
    public String date;
    /** Company name. */
    public String company;
    /** City / district. */
    public String cityArea;
    /** Education / years of experience. */
    public String record;
    /** Company nature (ownership type). */
    public String nacture;
    /** Headcount. */
    public String num;
    /** Salary. */
    public String money;

    // ---- getters -------------------------------------------------------

    public String getJobName() {
        return this.jobName;
    }

    public String getDate() {
        return this.date;
    }

    public String getCompany() {
        return this.company;
    }

    public String getCityArea() {
        return this.cityArea;
    }

    public String getRecord() {
        return this.record;
    }

    public String getNacture() {
        return this.nacture;
    }

    public String getNum() {
        return this.num;
    }

    public String getMoney() {
        return this.money;
    }

    // ---- setters -------------------------------------------------------

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    public void setCityArea(String cityArea) {
        this.cityArea = cityArea;
    }

    public void setRecord(String record) {
        this.record = record;
    }

    public void setNacture(String nacture) {
        this.nacture = nacture;
    }

    public void setNum(String num) {
        this.num = num;
    }

    public void setMoney(String money) {
        this.money = money;
    }

    /**
     * Renders all fields as {@code Job [name=value, ...]}, ordered
     * alphabetically by field name; unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Job [");
        text.append("cityArea=").append(cityArea);
        text.append(", company=").append(company);
        text.append(", date=").append(date);
        text.append(", jobName=").append(jobName);
        text.append(", money=").append(money);
        text.append(", nacture=").append(nacture);
        text.append(", num=").append(num);
        text.append(", record=").append(record);
        text.append("]");
        return text.toString();
    }
}
package com.sj.bd;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads chinahr.com search-result pages with jsoup and extracts
 * {@link Job} records from them.
 */
public class JsoupHtml {

    /** Minimum number of space-separated tokens required in the first info line. */
    private static final int FIRST_LINE_TOKENS = 3;
    /** Minimum number of space-separated tokens required in the second info line. */
    private static final int SECOND_LINE_TOKENS = 8;

    /** Accumulates every job parsed by this instance; returned by {@link #getDataInfo}. */
    List<Job> list = new ArrayList<Job>();

    /**
     * Fetches one search-result page.
     *
     * <p>{@code keyWord} and {@code city} are concatenated into the query string
     * as-is, so callers must pass values that are already URL-encoded.
     *
     * @param keyWord search keyword
     * @param city    city filter value
     * @param page    page number
     * @return the parsed page
     * @throws IOException          if the HTTP request fails
     * @throws InterruptedException if the thread is interrupted during the
     *                              pause that follows the request
     */
    public Document getDocument(String keyWord, String city, int page)
            throws IOException, InterruptedException {
        String url = "http://www.chinahr.com/sou/?city=" + city + "&keyword=" + keyWord + "&page=" + page;
        System.out.println("当前url地址为:" + url);
        Document content = Jsoup.connect(url).get();
        // Pause after every request — presumably to throttle the crawl.
        Thread.sleep(10000);
        return content;
    }

    /**
     * Extracts one {@link Job} from every element whose {@code class} attribute
     * is {@code jobList} and appends it to this instance's list.
     *
     * <p>Child blocks without the expected layout are skipped; an entry with no
     * parsable block is left out and reported on stderr.
     *
     * @param document page returned by {@link #getDocument}
     * @return this instance's accumulated job list (the same list object on
     *         every call)
     * @throws InterruptedException never thrown any more; kept in the signature
     *                              so existing callers compile unchanged
     */
    public List<Job> getDataInfo(Document document) throws InterruptedException {
        Elements jobList = document.getElementsByAttributeValue("class", "jobList");
        // Parsing is purely in-memory, so no pause is needed here.
        System.out.println("获取到" + jobList.size() + "条数据");

        for (int i = 0; i < jobList.size(); i++) {
            Job job = new Job();
            boolean parsed = false;
            Elements blocks = jobList.get(i).children();
            // Every parsable block overwrites the job's fields, so the last one wins.
            for (int j = 0; j < blocks.size(); j++) {
                if (fillJob(job, blocks.get(j))) {
                    parsed = true;
                }
            }
            if (parsed) {
                list.add(job);
            } else {
                System.err.println("Skipping job entry " + i + ": unexpected layout");
            }
        }
        return list;
    }

    /**
     * Copies the fields found in one block into {@code job}.
     *
     * @return {@code false}, leaving {@code job} untouched, when the block has
     *         fewer than two children or either line has too few tokens
     */
    private static boolean fillJob(Job job, Element block) {
        if (block.children().size() < 2) {
            return false;
        }
        String[] firstLine = block.child(0).text().split(" ");
        String[] secondLine = block.child(1).text().split(" ");
        if (firstLine.length < FIRST_LINE_TOKENS || secondLine.length < SECOND_LINE_TOKENS) {
            return false;
        }

        job.setJobName(firstLine[0]);
        job.setDate(firstLine[1]);
        job.setCompany(firstLine[2]);
        job.setCityArea(secondLine[0]);
        job.setRecord(secondLine[1]);
        job.setMoney(secondLine[2] + "元");
        job.setNacture(secondLine[3] + secondLine[4] + secondLine[5]);
        // NOTE(review): token 6 is not read — presumably a separator; confirm against the page markup.
        job.setNum(secondLine[7]);
        return true;
    }
}
package com.sj.bd;
import java.io.IOException;
import java.util.List;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
import org.jsoup.nodes.Document;
/**
 * Manual driver: crawls result pages 1 to 10 for a fixed keyword and city and
 * writes each page's jobs to its own .xls file.
 */
public class Test {

    /**
     * Fetches, parses and exports each page in turn; any failure aborts the run.
     *
     * @param args ignored
     */
    public static void main(String[] args)
            throws IOException, RowsExceededException, WriteException, InterruptedException {
        final String keyWord = "java";   // search keyword
        final String city = "34%2C398";  // city filter value
        final int lastPage = 10;

        int page = 1;
        while (page <= lastPage) {
            // A fresh crawler per page, so each workbook holds only that page's jobs.
            JsoupHtml crawler = new JsoupHtml();
            Document html = crawler.getDocument(keyWord, city, page);

            // Optional debug dump of the raw page (disabled):
            // FileUtils.writeStringToFile(new File("E:/zp/job/job.txt"), html.toString());

            List<Job> jobs = crawler.getDataInfo(html);
            Thread.sleep(2000); // pause before exporting

            String path = "E:/zp/job/zhyc-" + keyWord + page + ".xls";
            ExcelSolve exporter = new ExcelSolve();
            exporter.writeExcel(jobs, path, keyWord);

            System.out.println("写入成功了,且存在的位置为:" + path);
            System.out.println("-------------------------");
            page++;
        }
    }
}