一、分析网站内容
1、首先查看带有文章列表的用户主页:https://blog.csdn.net/qq_36511401,发现并没有什么特别的地方,就是网站网址加上个人帐号。
2、当我们打开第二页的时候,就发现网址变成了:https://blog.csdn.net/qq_36511401/article/list/2?,这真是一个好消息,因为新的网址是在原来的基础上加上/article/list/和当前的页数。
3、当我们输入:https://blog.csdn.net/qq_36511401/article/list/1?的时候,果然不出所料,指向的是第一页。所以我们只要在数据库中存入用户的主页链接,就可以获取这个用户所有文章的信息了。
二、分析网页内容
1、首先获取最大的页数。先获取全部的cssSelector("li[data-page]"),因为不止一个,所以要循环比较,值最大的即为最大的页数。获取到最大的页数后,有多少页就循环多少次,每次只要改变url就行了。
2、获取全部的文章item。cssSelector(".article-item-box"),弄一个循环,将每个文章的内容都提出来。
3、获取文章的title。cssSelector("h4 > a"),再用getText()提取文本就可以了。
4、获取文章简介。cssSelector(".content"),再用getText()提取文本就可以了。
5、获取发布时间、阅读数和评论数。发布时间用cssSelector(".date"),阅读数和评论数依次是cssSelector(".num")的第一个和第二个元素,同样用getText()提取文本就可以了。
三、核心代码。
1、数据库设计。两张表,一张csdn_user表为用户表,存放将要被爬取文章基本信息的用户主页链接。另一张csdn_article表为文章的基本信息表。
2、CsdnUserApi的代码。
package com.zxj.reptile.api.csdn;
import com.baomidou.mybatisplus.mapper.EntityWrapper;
import com.zxj.reptile.api.AjaxJson;
import com.zxj.reptile.module.csdn.entity.CsdnUser;
import com.zxj.reptile.module.csdn.service.ICsdnUserService;
import com.zxj.reptile.utils.ListUtils;
import com.zxj.reptile.utils.StringUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import java.util.Date;
import java.util.List;
@Api(value = "csdnUser", tags = "csdn用户")
@RestController
@RequestMapping("csdnUser")
public class CsdnUserApi {
@Autowired
private ICsdnUserService csdnUserService;
@ApiOperation(value = "添加表csdnUser内容", notes = "添加表csdnUser内容")
@ApiImplicitParams({
@ApiImplicitParam(name = "userUrl", value = "csdn用户主界面链接", dataType = "String", paramType = "query"),
@ApiImplicitParam(name = "userName", value = "csdn用户名", dataType = "String", paramType = "query"),
})
@RequestMapping(value = "addCsdnUser", method = RequestMethod.POST)
public AjaxJson addCsdnUser(@RequestParam(value = "userUrl") String userUrl,
@RequestParam(value = "userName") String userName) {
AjaxJson ajaxJson = new AjaxJson<>();
try {
CsdnUser csdnUser = csdnUserService.getCsdnUserByUrl(userUrl);
if (csdnUser != null) {
ajaxJson.setData(csdnUser);
ajaxJson.error("已存在该user了");
return ajaxJson;
}
CsdnUser entity = new CsdnUser();
entity.setCreateTime(new Date());
entity.setUserName(userName);
entity.setUserUrl(userUrl);
csdnUserService.insert(entity);
ajaxJson.setData(entity);
} catch (Exception e) {
e.printStackTrace();
ajaxJson.error("失败: " + e.getMessage());
}
return ajaxJson;
}
@ApiOperation(value = "获取表csdnUser信息", notes = "获取表csdnUser信息")
@ApiImplicitParams({
@ApiImplicitParam(name = "id", value = "id", dataType = "String", paramType = "query")
})
@RequestMapping(value = "getCsdnUser", method = RequestMethod.GET)
public AjaxJson getCsdnUser(@RequestParam(value = "id", required = false) String id) {
AjaxJson ajaxJson = new AjaxJson<>();
try {
EntityWrapper<CsdnUser> wrapper = new EntityWrapper<>();
if (StringUtils.isNotBlank(id)) {
wrapper.eq("id", id);
}
List<CsdnUser> entityList = csdnUserService.selectList(wrapper);
ajaxJson.setData(entityList);
} catch (Exception e) {
e.printStackTrace();
ajaxJson.error("失败: " + e.getMessage());
}
return ajaxJson;
}
@ApiOperation(value = "删除表csdnUser信息", notes = "删除表csdnUser信息")
@ApiImplicitParams({
@ApiImplicitParam(name = "ids", value = "id,多个用逗号分开", dataType = "String", paramType = "query")
})
@RequestMapping(value = "deleteCsdnUser", method = RequestMethod.DELETE)
public AjaxJson deleteCsdnUser(@RequestParam(value = "ids") String ids) {
AjaxJson ajaxJson = new AjaxJson<>();
try {
List<String> idList = StringUtils.commaSplit(ids);
if (ListUtils.isBlank(idList)) {
ajaxJson.error("参数格式出错");
return ajaxJson;
}
EntityWrapper<CsdnUser> wrapper = new EntityWrapper<>();
wrapper.in("id", idList);
boolean result = csdnUserService.delete(wrapper);
ajaxJson.setData(result);
} catch (Exception e) {
e.printStackTrace();
ajaxJson.error("失败: " + e.getMessage());
}
return ajaxJson;
}
}
3、CsdnArticleApi的代码。
package com.zxj.reptile.api.csdn;
import com.baomidou.mybatisplus.mapper.EntityWrapper;
import com.zxj.reptile.api.AjaxJson;
import com.zxj.reptile.module.csdn.entity.CsdnUser;
import com.zxj.reptile.module.csdn.service.ICsdnUserService;
import com.zxj.reptile.service.csdn.ReptileAticleUrlService;
import com.zxj.reptile.utils.StringUtils;
import com.zxj.reptile.utils.WebDriverUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;
import org.openqa.selenium.WebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import java.util.List;
/**
 * REST endpoint that triggers a crawl of CSDN article listings.
 *
 * <p>The users to crawl are read through {@link ICsdnUserService}; the crawl
 * itself is delegated to {@link ReptileAticleUrlService} using a browser
 * session obtained from {@link WebDriverUtils#createWebDriver()}.
 */
@Api(value = "csdnArticle", tags = "csdn文章")
@RestController
@RequestMapping("csdnArticle")
public class CsdnArticleApi {
    @Autowired
    private ICsdnUserService csdnUserService;
    @Autowired
    private ReptileAticleUrlService aticleUrlService;
    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Crawls the article lists of the stored users, ordered by {@code reptileTime}.
     *
     * @param userUser        optional user home-page URL (request parameter
     *                        {@code userUrl}); when blank every stored user is crawled
     * @param articleMaxCount optional article limit per user; {@code null} is
     *                        passed on as {@code 0}, which the crawl service
     *                        treats as "no limit"
     * @return response that carries an error message when the crawl setup or
     *         the crawl itself threw an exception
     */
    @ApiOperation(value = "csdn文章url采集", notes = "csdn文章url采集")
    @ApiImplicitParams({
            @ApiImplicitParam(name = "userUrl", value = "csdn用户主界面链接", dataType = "String", paramType = "query"),
            @ApiImplicitParam(name = "articleMaxCount", value = "爬取的文章数", dataType = "String", paramType = "query"),
    })
    @RequestMapping(value = "reptileArticleUrl", method = RequestMethod.GET)
    public AjaxJson reptileArticleUrl(@RequestParam(value = "userUrl", required = false) String userUser,
                                      @RequestParam(value = "articleMaxCount", required = false) Integer articleMaxCount) {
        AjaxJson ajaxJson = new AjaxJson<>();
        log.info("========================== 开始-csdn文章url采集==========================");
        WebDriver driver = null;
        try {
            EntityWrapper<CsdnUser> wrapper = new EntityWrapper<>();
            wrapper.setSqlSelect("userUrl").orderBy("reptileTime");
            if (StringUtils.isNotBlank(userUser)) {
                // Filter on the same column that is selected above; the previous
                // "userUser" key did not match the selected "userUrl" column.
                wrapper.eq("userUrl", userUser);
            }
            List<Object> csdnUsersList = csdnUserService.selectObjs(wrapper);
            int maxCount = articleMaxCount == null ? 0 : articleMaxCount;
            // Start the browser only after the query succeeded, and inside the try
            // so that a start-up failure is reported like any other error.
            driver = WebDriverUtils.createWebDriver();
            aticleUrlService.reptileArticleUrl(driver, csdnUsersList, maxCount);
        } catch (Exception e) {
            log.error("csdn文章url采集失败", e);
            ajaxJson.error("失败: " + e.getMessage());
        } finally {
            quitQuietly(driver);
        }
        log.info("========================== 结束-csdn文章url采集==========================");
        return ajaxJson;
    }

    /** Quits the browser session if one was started; a failure to quit is only logged. */
    private void quitQuietly(WebDriver driver) {
        if (driver == null) {
            return;
        }
        try {
            driver.quit();
        } catch (Exception e) {
            log.warn("WebDriver quit failed", e);
        }
    }
}
4、ReptileAticleUrlService的代码。
package com.zxj.reptile.service.csdn;
import com.baomidou.mybatisplus.mapper.EntityWrapper;
import com.zxj.reptile.module.csdn.entity.CsdnArticle;
import com.zxj.reptile.module.csdn.service.ICsdnArticleService;
import com.zxj.reptile.module.csdn.service.ICsdnUserService;
import com.zxj.reptile.utils.ListUtils;
import com.zxj.reptile.utils.PropertyUtils;
import com.zxj.reptile.utils.StringUtils;
import com.zxj.reptile.utils.WebDriverUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Crawls the article list pages of CSDN users with a Selenium
 * {@link WebDriver} and stores one {@link CsdnArticle} per listed article.
 */
@Service
public class ReptileAticleUrlService {
    /** 24-hour pattern ({@code HH}); {@code hh} would read 12:xx:xx as 00:xx:xx. */
    private static final String PUBLISH_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    @Autowired
    private ICsdnArticleService csdnArticleService;
    @Autowired
    private ICsdnUserService csdnUserService;
    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Crawls every user in {@code userUrlList}, one after another.
     *
     * <p>An exception while crawling one user is logged and the crawl moves on
     * to the next user.
     *
     * @param driver          browser session used to load the pages
     * @param userUrlList     user home-page URLs; each element is cast to {@code String}
     * @param articleMaxCount per-user limit; {@code 0} means no limit. The check
     *                        runs after each whole page is stored and paging stops
     *                        once the stored count exceeds the limit, so more
     *                        articles than the limit may be stored
     */
    public void reptileArticleUrl(WebDriver driver, List<Object> userUrlList, int articleMaxCount) {
        if (ListUtils.isBlank(userUrlList)) {
            log.info("=============暂无user要被采集!==============");
            return;
        }
        int userTotal = userUrlList.size();
        for (int i = 0; i < userTotal; i++) {
            try {
                reptileUser(driver, (String) userUrlList.get(i), i + 1, userTotal, articleMaxCount);
            } catch (Exception e) {
                log.error("采集第{}/{}个用户失败", i + 1, userTotal, e);
            }
        }
    }

    /** Replaces the stored articles of one user with a fresh crawl of that user's list pages. */
    private void reptileUser(WebDriver driver, String userUrl, int userNo, int userTotal, int articleMaxCount) {
        csdnArticleService.deleteOldArticle(userUrl);
        csdnUserService.updateReptileTime(userUrl);
        driver.get(userUrl);
        WebDriverUtils.sleep();

        int maxPageIndex = findMaxPageIndex(driver);
        // SimpleDateFormat is not thread-safe, so keep it local to this call.
        SimpleDateFormat publishTimeFormat = new SimpleDateFormat(PUBLISH_TIME_PATTERN);
        int totalCount = 0;
        for (int page = 1; page <= maxPageIndex; page++) {
            // The home page loaded above already shows page 1.
            if (page != 1) {
                driver.get(userUrl + "/article/list/" + page);
                WebDriverUtils.sleep();
            }
            List<WebElement> itemElements = driver.findElements(By.cssSelector(".article-item-box"));
            if (itemElements.isEmpty()) {
                // Nothing more for this user. Only this user's paging stops; the
                // caller's loop continues with the next user.
                return;
            }
            List<CsdnArticle> entityList = new ArrayList<>(itemElements.size());
            for (WebElement itemElement : itemElements) {
                entityList.add(parseArticle(itemElement, userUrl, publishTimeFormat));
            }
            csdnArticleService.insertBatch(entityList);
            log.info(String.format("第%d/%d个用户的第%d/%d页,共采集%d条数据", userNo, userTotal, page, maxPageIndex, entityList.size()));
            totalCount += entityList.size();
            if (articleMaxCount != 0 && totalCount > articleMaxCount) {
                break;
            }
        }
    }

    /** Returns the largest {@code data-page} value among the pager items, or 1 when there is none. */
    private int findMaxPageIndex(WebDriver driver) {
        int maxPageIndex = 1;
        for (WebElement pageBoxElement : driver.findElements(By.cssSelector("li[data-page]"))) {
            int pageIndex = Integer.parseInt(pageBoxElement.getAttribute("data-page"));
            maxPageIndex = Math.max(maxPageIndex, pageIndex);
        }
        return maxPageIndex;
    }

    /** Builds a {@link CsdnArticle} from one {@code .article-item-box} element. */
    private CsdnArticle parseArticle(WebElement itemElement, String userUrl, SimpleDateFormat publishTimeFormat) {
        CsdnArticle entity = new CsdnArticle();
        entity.setUserUrl(userUrl);
        entity.setCreateTime(new Date());

        WebElement titleElement = itemElement.findElement(By.cssSelector("h4 > a"));
        String title = titleElement.getText().trim();
        String articleUrl = titleElement.getAttribute("href").trim();
        String content = itemElement.findElement(By.cssSelector(".content")).getText().trim();
        String time = itemElement.findElement(By.cssSelector(".date")).getText().trim();
        // First .num is the read count, second is the comment count.
        List<WebElement> numElements = itemElement.findElements(By.cssSelector(".num"));
        int readCount = Integer.parseInt(numElements.get(0).getText().trim());
        int commentCount = Integer.parseInt(numElements.get(1).getText().trim());

        entity.setArticleUrl(articleUrl);
        entity.setContent(content);
        entity.setTitle(title);
        entity.setCommentCount(commentCount);
        entity.setReadCount(readCount);
        try {
            entity.setPublishTime(publishTimeFormat.parse(time));
        } catch (ParseException e) {
            // Keep the article; only the publish time stays unset.
            log.warn("无法解析发布时间: {}", time, e);
        }
        return entity;
    }
}
5、WebDriverUtils的代码。
package com.zxj.reptile.utils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
 * Static helpers for creating and driving a Chrome {@link WebDriver}.
 *
 * <p>Paths and the pause length are read through {@code PropertyUtils}
 * ({@code chrome.driver.path}, {@code chrome.path}, {@code chrome.sleep}).
 *
 * <p>Created by deng on 2017/5/16.
 */
public class WebDriverUtils {
    /** Page-load timeout applied to every created driver, in seconds. */
    private static final int PAGE_LOAD_TIMEOUT_SECONDS = 8;

    /**
     * Creates a Chrome driver with image loading disabled and the page-load
     * timeout set.
     *
     * @return the new driver; the caller is responsible for quitting it
     */
    public static WebDriver createWebDriver() {
        ChromeOptions options = new ChromeOptions();
        // Location of the chromedriver executable.
        System.setProperty("webdriver.chrome.driver", PropertyUtils.getString("chrome.driver.path"));
        // Location of the Chrome binary.
        options.setBinary(PropertyUtils.getString("chrome.path"));
        Map<String, Object> prefs = new HashMap<>();
        // Do not download/load images.
        prefs.put("profile.managed_default_content_settings.images", 2);
        options.setExperimentalOption("prefs", prefs);
        WebDriver driver = new ChromeDriver(options);
        driver.manage().timeouts().pageLoadTimeout(PAGE_LOAD_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        return driver;
    }

    /**
     * Creates a driver, loads {@code url} and pauses once.
     *
     * @param url page to load
     * @return the new driver
     */
    public static WebDriver createWebDriver(String url) {
        WebDriver driver = createWebDriver();
        driver.get(url);
        sleep();
        return driver;
    }

    /**
     * Quits {@code driver} (a failure to quit is printed and ignored) and
     * returns a fresh driver that has loaded {@code url}.
     *
     * @param driver driver to discard
     * @param url    page to load in the replacement driver
     * @return the replacement driver
     */
    public static WebDriver getNewWebDriver(WebDriver driver, String url) {
        try {
            driver.quit();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        WebDriver newDriver = createWebDriver();
        newDriver.get(url);
        sleep();
        return newDriver;
    }

    /**
     * Loads {@code url} in a newly opened window and closes the current one,
     * so that each navigation uses a fresh tab.
     *
     * @param driver driver whose current window is replaced
     * @param url    page to load in the new window
     */
    public static void getNewTag(WebDriver driver, String url) {
        JavascriptExecutor executor = (JavascriptExecutor) driver;
        executor.executeScript("function createDoc(){var w = window.open(); w.document.open(); w.document.write('<h1>Hello World!</h1>'); w.document.close();}; createDoc();");
        sleep();
        // Close the window the driver is currently focused on (the old one).
        driver.close();
        List<String> tabs = new ArrayList<>(driver.getWindowHandles());
        // Focus the first window that is still open.
        driver.switchTo().window(tabs.get(0));
        driver.get(url);
        sleep();
    }

    /**
     * Pauses the current thread for {@code chrome.sleep} (passed to
     * {@link Thread#sleep(long)}).
     *
     * <p>If the thread is interrupted the pause ends early and the interrupt
     * flag is restored so callers can still observe the interruption.
     */
    public static void sleep() {
        try {
            Thread.sleep(PropertyUtils.getInt("chrome.sleep"));
        } catch (InterruptedException e) {
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Finds the first element matching a CSS selector.
     *
     * @param driver      driver to search with
     * @param cssSelector CSS selector
     * @return the first match, or {@code null} when nothing matches
     */
    public static WebElement findEle(WebDriver driver, String cssSelector) {
        List<WebElement> elementList = driver.findElements(By.cssSelector(cssSelector));
        if (elementList == null || elementList.isEmpty()) {
            return null;
        }
        return elementList.get(0);
    }

    /**
     * Scrolls the current page to the bottom and pauses once.
     *
     * @param driver driver whose page is scrolled; must be a {@link JavascriptExecutor}
     */
    public static void scrollBottom(WebDriver driver) {
        JavascriptExecutor executor = (JavascriptExecutor) driver;
        // Scroll to the bottom of the page.
        executor.executeScript("window.scrollTo(0,document.body.scrollHeight)");
        sleep();
        // Alternative scroll snippets kept for reference:
        // Scroll to the top of the window.
        // executor.executeScript("window.scrollTo(0,0-document.body.scrollHeight)");
        // sleep();
        // Align the top of an element with the top of the window; scrollIntoView
        // defaults to true, so the argument is optional.
        // WebElement testElement=driver.findElement(By.cssSelector("#just-a-test-id"));
        // executor.executeScript("arguments[0].scrollIntoView();",testElement );
        // executor.executeScript("arguments[0].scrollIntoView(true);",testElement );
        // sleep();
        // Align the bottom of an element with the bottom of the window.
        // executor.executeScript("arguments[0].scrollIntoView(false);", testElement);
        // sleep();
    }
}