一、爬取网址
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A minimal Java crawler: fetches one page and writes every https:// URL
 * found in its HTML to a local text file, one URL per line.
 */
public class Robot {

    /** URL matching rule; compiled once, since Pattern compilation is expensive. */
    private static final Pattern URL_PATTERN =
            Pattern.compile("https://[\\w+\\.?/?]+\\.[A-Za-z]+");

    public static void main(String[] args) {
        try {
            // Page to crawl.
            URL url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");
            URLConnection urlconn = url.openConnection();
            // try-with-resources guarantees both streams are closed even on error.
            // (The original finally block would NPE if the connection failed before
            // `br`/`pw` were assigned, and never closed the writer in that case.)
            try (BufferedReader br = new BufferedReader(new InputStreamReader(
                         urlconn.getInputStream(), StandardCharsets.UTF_8));
                 // Extracted links are written to this file (autoflush enabled).
                 PrintWriter pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true)) {
                String buf;
                while ((buf = br.readLine()) != null) {
                    Matcher m = URL_PATTERN.matcher(buf);
                    while (m.find()) {
                        pw.println(m.group());
                    }
                }
            }
            System.out.println("爬取成功^_^");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
二、爬取视频
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls a video listing page for embedded .mp4 links and downloads each
 * video to a local directory, named by the title found on the page.
 *
 * @author cxd
 */
public class WebSpiderDemo1 {

    /** Matches the mp4 link in a line like: data-mp4="http://.../x.mp4" (group 1 = bare URL). */
    private static final Pattern VIDEO_URL_PATTERN =
            Pattern.compile("data-mp4=\"(.*?\\.mp4)");

    public static void main(String[] args) throws Exception {
        String source = "http://www.budejie.com/video/";
        String destDir = "C:/rob/";
        Map<String, String> urlMap = getUrlInSource(source);
        for (Map.Entry<String, String> entry : urlMap.entrySet()) {
            String title = entry.getKey();  // video name
            String url = entry.getValue();  // video url
            File destFile = new File(destDir + title + ".mp4");
            download(url, destFile);
        }
    }

    /**
     * Downloads the resource at the given URL into a local file.
     *
     * @param url      the video URL
     * @param destFile the file to write the video into
     * @throws IOException if the connection or the copy fails
     */
    public static void download(String url, File destFile) throws IOException {
        // try-with-resources closes both streams even if the copy fails part-way.
        // (The original leaked both on any IOException during the transfer.)
        try (InputStream is = new URL(url).openStream();
             OutputStream out = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = is.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
        }
    }

    /**
     * Scans the listing page(s) and collects (video title, video URL) pairs.
     *
     * @param source base listing URL; the page index is appended to it
     * @return map from cleaned video title to mp4 URL
     * @throws IOException if a page cannot be fetched or read
     */
    public static Map<String, String> getUrlInSource(String source) throws IOException {
        Map<String, String> hashMap = new HashMap<>();
        // Up to 50 pages exist; only page 1 is crawled here on purpose.
        for (int index = 1; index <= 1; index++) {
            URL url = new URL(source + index);
            // NOTE(review): if the site blocks plain requests, open an
            // HttpURLConnection instead and set a browser User-Agent header.
            try (BufferedReader br = new BufferedReader(
                         new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
                String title = null;
                String info;
                // readLine() returns null only at end of stream, so a plain
                // while-loop replaces the original's arbitrary 10000-line cap
                // (its "==null would NPE" comment was mistaken).
                while ((info = br.readLine()) != null) {
                    if (info.contains("data-title")) {
                        title = info;
                    }
                    Matcher matcher = VIDEO_URL_PATTERN.matcher(info);
                    // Guard title != null: the original NPE'd on title.trim()
                    // when an mp4 link appeared before any data-title line.
                    if (matcher.find() && title != null) {
                        // group(1) is the bare URL, without the data-mp4=" prefix —
                        // equivalent to the original's loop that skipped group(0).
                        hashMap.put(getTitle(title.trim()), matcher.group(1));
                    }
                }
            }
        }
        return hashMap;
    }

    /**
     * Cleans the raw title line: strips the leading {@code data-title="}
     * marker (12 characters) and the trailing quote.
     *
     * @param info raw trimmed line containing the data-title attribute
     * @return the bare title text
     */
    private static String getTitle(String info) {
        return info.substring(12, info.length() - 1);
    }
}
三、爬取图片