jsoup爬虫工具超简单使用(记录)

简单入门案例

// Fetch the page and parse it into a jsoup Document; the variable stays null if the fetch fails.
Document document = null;
try {
    final URL pageUrl = new URL("http://www.caas.cn/xwzx/yw/index.html");
    // The second argument (10000) is the timeout handed to Jsoup.parse.
    document = Jsoup.parse(pageUrl, 10000);
} catch (IOException ex) {
    ex.printStackTrace();
}
if (null == document) {
    // Nothing was loaded, so there is nothing to process.
    return;
}
// From here on, the Document object gives access to all of the page's content.

对Document的操作

通过Class获取一个元素节点
Elements list05 = document.getElementsByClass("list05");
返回一个数组Elements，本质是一个ArrayList
获取Elements的第几个元素
Element element = elements.get(index);
获取一个Element下的所有元素…第几个元素
Elements elements = element.children();
Element child = element.child(index);
判断某个元素的内容是否为空
boolean hasText = element.hasText();
获取元素内容
String s = element.text();
获取元素的html结构
String s = element.html();
选择子元素下某个标签
element.child(0).select("div");
获取元素的属性
String s = element.attr("src");

通过地址下载图片

//处理图片，将图片读取到目录
    /**
     * Resolves the image referenced by the first {@code <img>} found under the given
     * element and downloads it into the static resources directory.
     *
     * <p>If no {@code src} value is found, nothing is downloaded. A {@code src} that is
     * already an absolute http/https URL is used as-is; any other value has its
     * {@code "../"} segments removed and is appended to {@code http://www.caas.cn/}.
     *
     * @param element   element searched for an {@code img} tag
     * @param imageName file name appended to the hard-coded target directory
     */
    private void dealImage(Element element,String imageName) {
        String src = element.select("img").attr("src");
        if (src.isEmpty()) {
            // Without a src the URL would be the bare site root, and the homepage
            // HTML would be written to disk as if it were the image.
            return;
        }

        String imgUrl;
        if (src.startsWith("http://") || src.startsWith("https://")) {
            // Already absolute: prefixing the site root would corrupt it.
            imgUrl = src;
        } else {
            imgUrl = "http://www.caas.cn/" + src.replace("../", "");
        }

        String path = "G:\\IntelliJ IDEA 2017.3.1\\hncs\\src\\main\\resources\\static\\"+imageName;

        downImages(path, imgUrl);
    }

    /**
     * Downloads the resource at the given URL and writes it to the given file path.
     *
     * <p>Failures (malformed URL, connection or I/O errors) are not propagated: the
     * stack trace is printed and the method returns. Both streams are closed whether
     * or not the copy succeeds.
     *
     * @param filePath path of the file to write
     * @param imgUrl   URL of the image to download
     */
    public static void downImages(String filePath, String imgUrl) {
        // Destination file.
        File file = new File(filePath);

        try {
            URL url = new URL(imgUrl);
            URLConnection connection = url.openConnection();
            // 10-second limits for both connecting and reading, so a stalled
            // server cannot block the caller indefinitely.
            connection.setConnectTimeout(10 * 1000);
            connection.setReadTimeout(10 * 1000);

            // The input stream is opened first so that the destination file is not
            // created when the connection itself fails.
            try (InputStream in = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int size;
                while ((size = in.read(buf)) != -1) {
                    out.write(buf, 0, size);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException and is handled here too.
            e.printStackTrace();
        }
    }

jsoup爬虫工具超简单使用(记录)

猜你喜欢