WebDriver 采集csdn用户发表过的csdn文章信息

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/qq_36511401/article/details/102606797

一、分析网站内容

1、首先查看带有文章列表的用户主页:https://blog.csdn.net/qq_36511401,发现并没有什么特别的地方,就是网站网址加上个人帐号。

2、当我们打开第二页的时候,就发现网址变成了:https://blog.csdn.net/qq_36511401/article/list/2?,这真是一个好消息,新的网址是在原来的基础上加上/article/list/和当前的页数。

3、当我们输入:https://blog.csdn.net/qq_36511401/article/list/1?的时候,果然不出所料,指向的是第一页。所以我们只要在数据库中录入用户的主页链接,就可以获取这个用户所有文章的信息了。

二、分析网页内容

1、首先获取最大的页数。先获取全部的cssSelector("li[data-page]"),因为不止一个,所以要循环比较,值最大的即为最大的页数。获取到最大的页数后,就知道要循环几次,每次只要改变url就行了。

2、获取全部的文章item。cssSelector(".article-item-box"),弄一个循环,将每个文章的内容都提出来。

3、获取文章的title。cssSelector("h4 > a"),再用getText()提取文本就可以了。

4、获取文章简介。cssSelector(".content"),再用getText()提取文本就可以了。

5、获取发布时间、阅读数和评论数。发布时间用cssSelector(".date"),阅读数和评论数用cssSelector(".num")(第一个为阅读数,第二个为评论数),再用getText()提取文本就可以了。

三、核心代码。

1、数据库设计。两张表,一张csdn_user表为用户表,存放将要被爬取文章基本信息的用户主页链接。另一张csdn_article表为文章的基本信息表。

2、CsdnUserApi的代码。

package com.zxj.reptile.api.csdn;


import com.baomidou.mybatisplus.mapper.EntityWrapper;
import com.zxj.reptile.api.AjaxJson;
import com.zxj.reptile.module.csdn.entity.CsdnUser;
import com.zxj.reptile.module.csdn.service.ICsdnUserService;
import com.zxj.reptile.utils.ListUtils;
import com.zxj.reptile.utils.StringUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import java.util.Date;
import java.util.List;

/**
 * REST endpoints for adding, listing and deleting {@link CsdnUser} records
 * through {@link ICsdnUserService}.
 *
 * <p>Every endpoint answers with an {@code AjaxJson}; any exception raised while
 * handling a request is printed and turned into an error response.
 */
@Api(value = "csdnUser", tags = "csdn用户")
@RestController
@RequestMapping("csdnUser")
public class CsdnUserApi {
    @Autowired
    private ICsdnUserService csdnUserService;

    /**
     * Stores a new user unless one with the same home-page URL is already known.
     *
     * @param userUrl  home-page URL of the CSDN user
     * @param userName display name of the CSDN user
     * @return response carrying the newly inserted user, or the already existing
     *         user together with an error message
     */
    @ApiOperation(value = "添加表csdnUser内容", notes = "添加表csdnUser内容")
    @ApiImplicitParams({
            @ApiImplicitParam(name = "userUrl", value = "csdn用户主界面链接", dataType = "String", paramType = "query"),
            @ApiImplicitParam(name = "userName", value = "csdn用户名", dataType = "String", paramType = "query"),
    })
    @RequestMapping(value = "addCsdnUser", method = RequestMethod.POST)
    public AjaxJson addCsdnUser(@RequestParam(value = "userUrl") String userUrl,
                                @RequestParam(value = "userName") String userName) {
        AjaxJson response = new AjaxJson<>();
        try {
            CsdnUser existing = csdnUserService.getCsdnUserByUrl(userUrl);
            if (existing == null) {
                CsdnUser created = new CsdnUser();
                created.setCreateTime(new Date());
                created.setUserName(userName);
                created.setUserUrl(userUrl);
                csdnUserService.insert(created);
                response.setData(created);
            } else {
                // Duplicate URL: hand back the stored record and flag the request as failed.
                response.setData(existing);
                response.error("已存在该user了");
            }
        } catch (Exception e) {
            reportFailure(response, e);
        }
        return response;
    }

    /**
     * Lists users, optionally restricted to a single id.
     *
     * @param id id to filter on; when blank, no filter is applied
     * @return response carrying the list of matching users
     */
    @ApiOperation(value = "获取表csdnUser信息", notes = "获取表csdnUser信息")
    @ApiImplicitParams({
            @ApiImplicitParam(name = "id", value = "id", dataType = "String", paramType = "query")
    })
    @RequestMapping(value = "getCsdnUser", method = RequestMethod.GET)
    public AjaxJson getCsdnUser(@RequestParam(value = "id", required = false) String id) {
        AjaxJson response = new AjaxJson<>();
        try {
            EntityWrapper<CsdnUser> query = new EntityWrapper<>();
            if (StringUtils.isNotBlank(id)) {
                query.eq("id", id);
            }
            response.setData(csdnUserService.selectList(query));
        } catch (Exception e) {
            reportFailure(response, e);
        }
        return response;
    }

    /**
     * Deletes the users whose ids are given as a comma-separated list.
     *
     * @param ids comma-separated ids
     * @return response carrying the boolean result of the delete call, or an error
     *         when the id list is blank after splitting
     */
    @ApiOperation(value = "删除表csdnUser信息", notes = "删除表csdnUser信息")
    @ApiImplicitParams({
            @ApiImplicitParam(name = "ids", value = "id,多个用逗号分开", dataType = "String", paramType = "query")
    })
    @RequestMapping(value = "deleteCsdnUser", method = RequestMethod.DELETE)
    public AjaxJson deleteCsdnUser(@RequestParam(value = "ids") String ids) {
        AjaxJson response = new AjaxJson<>();
        try {
            List<String> idList = StringUtils.commaSplit(ids);
            if (!ListUtils.isBlank(idList)) {
                EntityWrapper<CsdnUser> query = new EntityWrapper<>();
                query.in("id", idList);
                response.setData(csdnUserService.delete(query));
            } else {
                response.error("参数格式出错");
            }
        } catch (Exception e) {
            reportFailure(response, e);
        }
        return response;
    }

    /** Prints the exception and marks the response as failed with the exception message. */
    private static void reportFailure(AjaxJson response, Exception e) {
        e.printStackTrace();
        response.error("失败: " + e.getMessage());
    }
}

3、CsdnArticleApi的代码。

package com.zxj.reptile.api.csdn;


import com.baomidou.mybatisplus.mapper.EntityWrapper;
import com.zxj.reptile.api.AjaxJson;
import com.zxj.reptile.module.csdn.entity.CsdnUser;
import com.zxj.reptile.module.csdn.service.ICsdnUserService;
import com.zxj.reptile.service.csdn.ReptileAticleUrlService;
import com.zxj.reptile.utils.StringUtils;
import com.zxj.reptile.utils.WebDriverUtils;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;
import org.openqa.selenium.WebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

/**
 * REST endpoint that triggers crawling of CSDN article lists for the users
 * returned by {@link ICsdnUserService}.
 */
@Api(value = "csdnArticle", tags = "csdn文章")
@RestController
@RequestMapping("csdnArticle")
public class CsdnArticleApi {
    @Autowired
    private ICsdnUserService csdnUserService;
    @Autowired
    private ReptileAticleUrlService aticleUrlService;

    private Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Crawls the article lists of the selected users with a freshly created browser.
     *
     * <p>The user URLs are queried first, ordered by {@code reptileTime}; the browser is
     * only started once that query has succeeded and is always shut down afterwards,
     * whether or not the crawl failed.
     *
     * @param userUser        value of the {@code userUrl} request parameter (the Java name
     *                        is historical); when not blank, only the user with this
     *                        home-page URL is crawled
     * @param articleMaxCount article limit handed to the crawl service; {@code null} is
     *                        passed on as {@code 0}
     * @return an {@code AjaxJson} that carries an error message when the crawl threw
     */
    @ApiOperation(value = "csdn文章url采集", notes = "csdn文章url采集")
    @ApiImplicitParams({
            @ApiImplicitParam(name = "userUrl", value = "csdn用户主界面链接", dataType = "String", paramType = "query"),
            @ApiImplicitParam(name = "articleMaxCount", value = "爬取的文章数", dataType = "String", paramType = "query"),
    })
    @RequestMapping(value = "reptileArticleUrl", method = RequestMethod.GET)
    public AjaxJson reptileArticleUrl(@RequestParam(value = "userUrl", required = false) String userUser,
                                      @RequestParam(value = "articleMaxCount", required = false) Integer articleMaxCount) {
        AjaxJson ajaxJson = new AjaxJson<>();
        log.info("========================== 开始-csdn文章url采集==========================");
        WebDriver driver = null;
        try {
            EntityWrapper<CsdnUser> wrapper = new EntityWrapper<>();
            wrapper.setSqlSelect("userUrl").orderBy("reptileTime");
            if (StringUtils.isNotBlank(userUser)) {
                // Filter on the same column that is selected above.
                wrapper.eq("userUrl", userUser);
            }
            List<Object> csdnUsersList = csdnUserService.selectObjs(wrapper);
            int maxCount = articleMaxCount == null ? 0 : articleMaxCount;
            // Start the browser only after the query succeeded so a DB failure never leaves one running.
            driver = WebDriverUtils.createWebDriver();
            aticleUrlService.reptileArticleUrl(driver, csdnUsersList, maxCount);
        } catch (Exception e) {
            log.error("csdn文章url采集失败", e);
            ajaxJson.error("失败: " + e.getMessage());
        } finally {
            if (driver != null) {
                try {
                    driver.quit();
                } catch (Exception e) {
                    // Shutting the browser down is best-effort; the response is already decided.
                    log.warn("关闭WebDriver失败", e);
                }
            }
            log.info("========================== 结束-csdn文章url采集==========================");
        }
        return ajaxJson;
    }
}

4、ReptileAticleUrlService的代码。

package com.zxj.reptile.service.csdn;

import com.baomidou.mybatisplus.mapper.EntityWrapper;
import com.zxj.reptile.module.csdn.entity.CsdnArticle;
import com.zxj.reptile.module.csdn.service.ICsdnArticleService;
import com.zxj.reptile.module.csdn.service.ICsdnUserService;
import com.zxj.reptile.utils.ListUtils;
import com.zxj.reptile.utils.PropertyUtils;
import com.zxj.reptile.utils.StringUtils;
import com.zxj.reptile.utils.WebDriverUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Crawls the article list pages of CSDN users with a Selenium {@link WebDriver}
 * and stores one {@link CsdnArticle} per listed article.
 */
@Service
public class ReptileAticleUrlService {
    /** 24-hour pattern used to parse the publish time text of an article item. */
    private static final String PUBLISH_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    @Autowired
    private ICsdnArticleService csdnArticleService;
    @Autowired
    private ICsdnUserService csdnUserService;

    private Logger log = LoggerFactory.getLogger(getClass());


    /**
     * Crawls the article lists of every given user, one user after another.
     *
     * <p>For each user the previously stored articles are deleted and the user's
     * reptile time is updated before the home page is opened. A failure while crawling
     * one user is logged and does not stop the remaining users.
     *
     * @param driver          browser used to load the pages
     * @param userUrlList     home-page URLs of the users to crawl; each element is cast to {@code String}
     * @param articleMaxCount per-user article limit; {@code 0} means unlimited. Whole pages are
     *                        stored, so the limit is checked after each page and may be overshot
     *                        by less than one page
     */
    public void reptileArticleUrl(WebDriver driver, List<Object> userUrlList, int articleMaxCount) {
        if (ListUtils.isBlank(userUrlList)) {
            log.info("=============暂无user要被采集!==============");
            return;
        }
        // SimpleDateFormat is not thread-safe, so it is created per call instead of being shared.
        SimpleDateFormat publishTimeFormat = new SimpleDateFormat(PUBLISH_TIME_PATTERN);
        for (int i = 0; i < userUrlList.size(); i++) {
            try {
                String userUrl = (String) userUrlList.get(i);
                csdnArticleService.deleteOldArticle(userUrl);
                csdnUserService.updateReptileTime(userUrl);
                driver.get(userUrl);
                WebDriverUtils.sleep();
                int maxPageIndex = findMaxPageIndex(driver);
                int totalCount = 0;
                for (int j = 1; j <= maxPageIndex; j++) {
                    // The home page loaded above already shows page 1.
                    if (j != 1) {
                        driver.get(userUrl + "/article/list/" + j);
                        WebDriverUtils.sleep();
                    }
                    List<WebElement> itemElements = driver.findElements(By.cssSelector(".article-item-box"));
                    if (itemElements.isEmpty()) {
                        // An empty page ends this user only; the remaining users must still be crawled.
                        break;
                    }
                    List<CsdnArticle> entityList = new ArrayList<>();
                    for (WebElement itemElement : itemElements) {
                        entityList.add(parseArticle(itemElement, userUrl, publishTimeFormat));
                    }
                    csdnArticleService.insertBatch(entityList);
                    log.info(String.format("第%d/%d个用户的第%d/%d页,共采集%d条数据", i + 1, userUrlList.size(), j, maxPageIndex, entityList.size()));
                    totalCount += entityList.size();
                    // Stop as soon as the limit is reached, not one page after it.
                    if (articleMaxCount != 0 && totalCount >= articleMaxCount) {
                        break;
                    }
                }
            } catch (Exception e) {
                log.error("第{}/{}个用户采集失败", i + 1, userUrlList.size(), e);
            }
        }
    }

    /** Returns the largest {@code data-page} value among the pager items of the current page, or 1 if there are none. */
    private int findMaxPageIndex(WebDriver driver) {
        int maxPageIndex = 1;
        for (WebElement pageBoxElement : driver.findElements(By.cssSelector("li[data-page]"))) {
            int pageIndex = Integer.parseInt(pageBoxElement.getAttribute("data-page"));
            maxPageIndex = Math.max(maxPageIndex, pageIndex);
        }
        return maxPageIndex;
    }

    /**
     * Builds a {@link CsdnArticle} from one {@code .article-item-box} element.
     *
     * <p>The first {@code .num} element is read as the read count and the second as the
     * comment count. An unparsable publish time is logged and left unset.
     */
    private CsdnArticle parseArticle(WebElement itemElement, String userUrl, SimpleDateFormat publishTimeFormat) {
        CsdnArticle entity = new CsdnArticle();
        entity.setUserUrl(userUrl);
        entity.setCreateTime(new Date());

        WebElement titleElement = itemElement.findElement(By.cssSelector("h4 > a"));
        entity.setTitle(titleElement.getText().trim());
        entity.setArticleUrl(titleElement.getAttribute("href").trim());
        entity.setContent(itemElement.findElement(By.cssSelector(".content")).getText().trim());

        // Look the counters up once instead of querying the DOM for each of them.
        List<WebElement> countElements = itemElement.findElements(By.cssSelector(".num"));
        entity.setReadCount(Integer.parseInt(countElements.get(0).getText().trim()));
        entity.setCommentCount(Integer.parseInt(countElements.get(1).getText().trim()));

        String time = itemElement.findElement(By.cssSelector(".date")).getText().trim();
        try {
            entity.setPublishTime(publishTimeFormat.parse(time));
        } catch (ParseException e) {
            log.warn("发布时间解析失败: {}", time, e);
        }
        return entity;
    }
}

5、WebDriverUtils的代码。

package com.zxj.reptile.utils;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Static helpers for creating and driving a Chrome {@link WebDriver}.
 *
 * <p>Created by deng on 2017/5/16.
 */
public class WebDriverUtils {
    /** Page-load timeout, in seconds, applied to every driver created by this class. */
    private static final int PAGE_LOAD_TIMEOUT_SECONDS = 8;

    /**
     * Creates a Chrome driver using the chromedriver and Chrome binary locations read from
     * the {@code chrome.driver.path} and {@code chrome.path} properties.
     *
     * @return a new driver with the page-load timeout set; the caller must {@code quit()} it
     */
    public static WebDriver createWebDriver() {
        ChromeOptions options = new ChromeOptions();
        // Location of the chromedriver executable.
        System.setProperty("webdriver.chrome.driver", PropertyUtils.getString("chrome.driver.path"));
        // Location of the Chrome binary.
        options.setBinary(PropertyUtils.getString("chrome.path"));
        Map<String, Object> prefs = new HashMap<>();
        // Image-loading preference; the original author's note says 2 disables image download.
        prefs.put("profile.managed_default_content_settings.images", 2);
        options.setExperimentalOption("prefs", prefs);
        WebDriver driver = new ChromeDriver(options);
        driver.manage().timeouts().pageLoadTimeout(PAGE_LOAD_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        return driver;
    }

    /**
     * Creates a driver and opens {@code url} in it.
     *
     * @param url page to load
     * @return the driver showing {@code url}
     * @throws RuntimeException whatever {@code driver.get} throws; the driver is quit first
     *                          so the browser process is not leaked
     */
    public static WebDriver createWebDriver(String url) {
        WebDriver driver = createWebDriver();
        openOrQuit(driver, url);
        return driver;
    }

    /**
     * Quits {@code driver} (best-effort) and returns a fresh driver that has opened {@code url}.
     *
     * @param driver driver to discard; may be {@code null}
     * @param url    page to load in the new driver
     * @return the new driver showing {@code url}
     * @throws RuntimeException whatever {@code driver.get} throws; the new driver is quit first
     */
    public static WebDriver getNewWebDriver(WebDriver driver, String url) {
        if (driver != null) {
            try {
                driver.quit();
            } catch (Exception ex) {
                // The old driver is being thrown away anyway; report and carry on.
                ex.printStackTrace();
            }
        }
        WebDriver newDriver = createWebDriver();
        openOrQuit(newDriver, url);
        return newDriver;
    }

    /** Loads {@code url} and pauses; if loading throws, quits the driver before rethrowing. */
    private static void openOrQuit(WebDriver driver, String url) {
        try {
            driver.get(url);
        } catch (RuntimeException e) {
            try {
                driver.quit();
            } catch (RuntimeException quitFailure) {
                e.addSuppressed(quitFailure);
            }
            throw e;
        }
        sleep();
    }

    /**
     * Opens a new tab, closes the current one and loads {@code url} in the new tab.
     *
     * <p>If the script did not produce a new window handle, the current tab is kept and
     * {@code url} is loaded there instead of closing the only remaining window.
     *
     * @param driver driver to operate on; must also be a {@link JavascriptExecutor}
     * @param url    page to load
     */
    public static void getNewTag(WebDriver driver, String url) {
        List<String> handlesBefore = new ArrayList<>(driver.getWindowHandles());
        JavascriptExecutor executor = (JavascriptExecutor) driver;
        executor.executeScript("function createDoc(){var w = window.open(); w.document.open(); w.document.write('<h1>Hello World!</h1>'); w.document.close();}; createDoc();");
        sleep();
        // Identify the handle that window.open() added rather than relying on handle order.
        List<String> newHandles = new ArrayList<>(driver.getWindowHandles());
        newHandles.removeAll(handlesBefore);
        if (!newHandles.isEmpty()) {
            driver.close();
            driver.switchTo().window(newHandles.get(0));
        }
        driver.get(url);
        sleep();
    }

    /**
     * Pauses the current thread for the number of milliseconds configured under
     * {@code chrome.sleep}. If interrupted, returns early with the interrupt flag restored.
     */
    public static void sleep() {
        try {
            Thread.sleep(PropertyUtils.getInt("chrome.sleep"));
        } catch (InterruptedException e) {
            // Preserve the interruption for callers instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Finds the first element matching {@code cssSelector}.
     *
     * @param driver      driver to search in
     * @param cssSelector CSS selector to match
     * @return the first match, or {@code null} when nothing matches
     */
    public static WebElement findEle(WebDriver driver, String cssSelector) {
        List<WebElement> elementList = driver.findElements(By.cssSelector(cssSelector));
        if (elementList == null || elementList.isEmpty()) {
            return null;
        }
        return elementList.get(0);
    }

    /**
     * Scrolls the window to the bottom of the document and then pauses.
     *
     * @param driver driver to operate on; must also be a {@link JavascriptExecutor}
     */
    public static void scrollBottom(WebDriver driver) {
        JavascriptExecutor executor = (JavascriptExecutor) driver;
        executor.executeScript("window.scrollTo(0,document.body.scrollHeight)");
        sleep();
    }
}

猜你喜欢

转载自blog.csdn.net/qq_36511401/article/details/102606797