package cn.hanquan.file;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Walks the article-list pages of one CSDN blog, collects the article links found on
 * them and then requests every collected link once.
 *
 * <p>The blog owner's user id is parsed out of {@link #MYURL}. Every list page that is
 * fetched is also saved as {@code Page<n>.txt} inside {@link #PATH}.
 */
public class UrlCrawBoke {
    /** Any URL of the target blog; the user id is extracted from it. */
    public static String MYURL = "https://blog.csdn.net/zhhp1001/article/details/89352869";
    /** Directory that receives the saved list pages (the {@code user.dir} system property). */
    public static String PATH = System.getProperty("user.dir");

    /** Charset used both for decoding fetched pages and for the saved page files. */
    private static final String CHARSET = "UTF-8";
    /** Text that precedes the first path segment of a blog URL. */
    private static final String HOST_MARKER = "csdn.net/";
    /** Path fragment that precedes the numeric article id. */
    private static final String DETAILS_MARKER = "/details/";
    /** Lines containing this text are skipped (hard-coded exclusion; reason not evident here). */
    private static final String EXCLUDED_MARKER = "yoyo_liyy";
    /** When a list page contains this text the crawl stops: there are no more pages. */
    private static final String EMPTY_PAGE_MARKER = "空空如也";
    /** Highest list-page number that is ever requested. */
    private static final int MAX_PAGES = 99;
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /**
     * Program entry point: crawl the list pages, then request every article URL found.
     *
     * @param urlstr command-line arguments (unused)
     * @throws IOException if a page cannot be fetched or a page file cannot be written
     * @throws IllegalArgumentException if no user id can be derived from {@link #MYURL}
     */
    public static void main(String urlstr[]) throws IOException {
        String userId = extractUserId(MYURL);
        System.out.println("用户名: " + userId);

        final String homeUrl = "https://blog.csdn.net/" + userId + "/article/list/";
        List<String> urls = new ArrayList<String>();
        int totalPage = 0;
        for (int i = 1; i <= MAX_PAGES; i++) {
            String curUrl = homeUrl + i;
            System.out.println(curUrl);
            String pageStr = fetchPage(curUrl);
            savePage(pageStr, i);
            System.out.println("正在获取第" + i + "页...");
            collectArticleUrls(pageStr, userId, urls);
            if (pageStr.lastIndexOf(EMPTY_PAGE_MARKER) != -1) {
                System.out.println("No This Page!");
                break;
            } else {
                System.out.println("Success~");
            }
            totalPage = i;
        }
        System.out.println("总页数为: " + totalPage);

        for (int i = 0; i < urls.size(); i++) {
            visit(urls.get(i));
            System.out.println("成功访问第" + (i + 1) + "个链接,共" + urls.size() + "个:" + urls.get(i));
        }
        System.out.println("运行完毕,成功增加访问数:" + urls.size());
    }

    /**
     * Opens an HTTP GET connection with a browser-like User-Agent and returns the
     * response body stream. The caller is responsible for closing the stream.
     *
     * @param urlstr absolute URL to request
     * @return the response body stream
     * @throws IOException if the connection fails, times out or the server reports an error
     */
    public static InputStream doGet(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Without timeouts a stalled server would block the whole run forever.
        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        conn.setReadTimeout(READ_TIMEOUT_MILLIS);
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        return conn.getInputStream();
    }

    /**
     * Reads the stream to its end and decodes all bytes with the given charset.
     * The stream is not closed.
     *
     * @param is stream to drain
     * @param charset name of the charset used for decoding, e.g. {@code "UTF-8"}
     * @return the decoded content; empty if the stream has no data
     * @throws IOException if reading fails or the charset name is not supported
     */
    public static String inputStreamToString(InputStream is, String charset) throws IOException {
        // Collect the raw bytes first and decode once: decoding chunk by chunk would
        // corrupt any multi-byte character that straddles two reads.
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] bytes = new byte[1024];
        int byteLength;
        while ((byteLength = is.read(bytes)) != -1) {
            collected.write(bytes, 0, byteLength);
        }
        return collected.toString(charset);
    }

    /** Returns the user id (first path segment, or the one after a leading "blog") of a blog URL. */
    private static String extractUserId(String blogUrl) {
        int hostAt = blogUrl.lastIndexOf(HOST_MARKER);
        if (hostAt < 0) {
            throw new IllegalArgumentException("Not a csdn.net URL: " + blogUrl);
        }
        int begin = hostAt + HOST_MARKER.length();
        int end = segmentEnd(blogUrl, begin);
        String userId = blogUrl.substring(begin, end);
        if (userId.equals("blog")) {
            // The id is the next path segment; it only exists if "blog" is followed by '/'.
            boolean hasNext = end < blogUrl.length() && blogUrl.charAt(end) == '/';
            begin = hasNext ? end + 1 : blogUrl.length();
            end = segmentEnd(blogUrl, begin);
            userId = blogUrl.substring(begin, end);
        }
        if (userId.length() == 0) {
            throw new IllegalArgumentException("No user id found in URL: " + blogUrl);
        }
        return userId;
    }

    /** Returns the index of the first '/', '?' or '#' at or after {@code from}, or the URL length. */
    private static int segmentEnd(String url, int from) {
        int pos = from;
        while (pos < url.length()) {
            char c = url.charAt(pos);
            if (c == '/' || c == '?' || c == '#') {
                break;
            }
            pos++;
        }
        return pos;
    }

    /** Downloads a page, decodes it as UTF-8 and always closes the connection stream. */
    private static String fetchPage(String url) throws IOException {
        InputStream in = doGet(url);
        try {
            return inputStreamToString(in, CHARSET);
        } finally {
            in.close();
        }
    }

    /** Requests a URL and discards the response, releasing the stream immediately. */
    private static void visit(String url) throws IOException {
        doGet(url).close();
    }

    /** Writes a list page to {@code PATH/Page<pageNo>.txt} as UTF-8; write errors are reported. */
    private static void savePage(String page, int pageNo) throws IOException {
        Writer out = new OutputStreamWriter(new FileOutputStream(PATH + "/Page" + pageNo + ".txt"), CHARSET);
        try {
            out.write(page);
        } finally {
            out.close();
        }
    }

    /**
     * Scans a list page line by line, up to the line containing {@code </main>}, and
     * appends every not yet collected article URL of {@code userId} to {@code urls}.
     */
    private static void collectArticleUrls(String page, String userId, List<String> urls) throws IOException {
        BufferedReader lines = new BufferedReader(new StringReader(page));
        String line;
        while ((line = lines.readLine()) != null && line.indexOf("</main>") == -1) {
            if (line.indexOf("http") == -1 || line.indexOf(DETAILS_MARKER) == -1
                    || line.indexOf(EXCLUDED_MARKER) != -1 || line.indexOf(userId) == -1) {
                continue;
            }
            String articleUrl = extractArticleUrl(line);
            if (articleUrl != null && !urls.contains(articleUrl)) {
                urls.add(articleUrl);
                System.out.println(articleUrl);
            }
        }
    }

    /**
     * Cuts the article URL out of a line: from the "http" closest before "/details/"
     * through the last digit of the article id. Returns {@code null} if the line holds
     * no such URL.
     */
    private static String extractArticleUrl(String line) {
        int detailsAt = line.indexOf(DETAILS_MARKER);
        if (detailsAt < 0) {
            return null;
        }
        int start = line.lastIndexOf("http", detailsAt);
        if (start < 0) {
            return null;
        }
        int idStart = detailsAt + DETAILS_MARKER.length();
        int idEnd = idStart;
        // Take every digit of the id instead of assuming a fixed id length.
        while (idEnd < line.length() && line.charAt(idEnd) >= '0' && line.charAt(idEnd) <= '9') {
            idEnd++;
        }
        if (idEnd == idStart) {
            return null;
        }
        return line.substring(start, idEnd);
    }
}
package cn.hanquan.file;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Walks the article-list pages of one CSDN blog, collects the distinct article links
 * found on them with a regular expression and then requests every link once.
 */
public class UrlCrawBoke {
    /** User id of the blog whose articles are collected. */
    static String userId = "sinat_42483341";

    /** Charset used for decoding fetched pages. */
    private static final String CHARSET = "UTF-8";
    /** When a list page contains this text the crawl stops: there are no more pages. */
    private static final String EMPTY_PAGE_MARKER = "空空如也";
    /** Highest list-page number that is ever requested. */
    private static final int MAX_PAGES = 99;
    /** Pause before each list-page request. */
    private static final long PAGE_DELAY_MILLIS = 1000;
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /**
     * Program entry point: crawl the list pages, print the links found, then request
     * every link.
     *
     * @param urlstr command-line arguments (unused)
     * @throws IOException if a request fails
     * @throws InterruptedException if the thread is interrupted while pausing between pages
     */
    public static void main(String urlstr[]) throws IOException, InterruptedException {
        Set<String> urls = new HashSet<String>();
        final String homeUrl = "https://blog.csdn.net/" + userId + "/article/list/";
        // Compiled once for all pages. The literal URL prefix is quoted so that neither
        // the dots of the host nor any character of the user id acts as a regex operator;
        // the id may have any number of digits.
        final Pattern articleLink = Pattern.compile("(?<=href=\")"
                + Pattern.quote("https://blog.csdn.net/" + userId + "/article/details/")
                + "[0-9]+(?=\")");

        int totalPage = 0;
        for (int i = 1; i <= MAX_PAGES; i++) {
            Thread.sleep(PAGE_DELAY_MILLIS);
            System.out.println("finding page " + i);
            String curUrl = homeUrl + i;
            System.out.println(curUrl);
            String pageStr = fetchPage(curUrl);
            urls.addAll(collectMatches(pageStr, articleLink));
            if (pageStr.lastIndexOf(EMPTY_PAGE_MARKER) != -1) {
                System.out.println("No This Page!");
                break;
            } else {
                System.out.println("Success~");
            }
            totalPage = i;
        }
        System.out.println("总页数为: " + totalPage);

        System.out.println("打印每个链接");
        for (String s : urls) {
            System.out.println(s);
        }
        System.out.println("打印每个链接完毕");

        int i = 0;
        for (String s : urls) {
            visit(s);
            System.out.println("成功访问第" + (++i) + "个链接,共" + urls.size() + "个:" + s);
        }
        System.out.println("运行完毕,成功增加访问数:" + urls.size());
    }

    /**
     * Opens an HTTP GET connection with a browser-like User-Agent and returns the
     * response body stream. The caller is responsible for closing the stream.
     *
     * @param urlstr absolute URL to request
     * @return the response body stream
     * @throws IOException if the connection fails, times out or the server reports an error
     */
    public static InputStream doGet(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Without timeouts a stalled server would block the whole run forever.
        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        conn.setReadTimeout(READ_TIMEOUT_MILLIS);
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        return conn.getInputStream();
    }

    /**
     * Reads the stream to its end and decodes all bytes with the given charset.
     * The stream is not closed.
     *
     * @param is stream to drain
     * @param charset name of the charset used for decoding, e.g. {@code "UTF-8"}
     * @return the decoded content; empty if the stream has no data
     * @throws IOException if reading fails or the charset name is not supported
     */
    public static String inputStreamToString(InputStream is, String charset) throws IOException {
        // Collect the raw bytes first and decode once: decoding chunk by chunk would
        // corrupt any multi-byte character that straddles two reads.
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] bytes = new byte[1024];
        int byteLength;
        while ((byteLength = is.read(bytes)) != -1) {
            collected.write(bytes, 0, byteLength);
        }
        return collected.toString(charset);
    }

    /**
     * Returns every non-overlapping match of {@code regex} in {@code str}, in order of
     * appearance.
     *
     * @param str text to search; {@code null} yields an empty list
     * @param regex regular expression to look for
     * @return the matched substrings, possibly empty, never {@code null}
     */
    public static List<String> getMatherSubstrs(String str, String regex) {
        return collectMatches(str, Pattern.compile(regex));
    }

    /** Returns all matches of an already compiled pattern; a null text has no matches. */
    private static List<String> collectMatches(String text, Pattern pattern) {
        List<String> found = new ArrayList<String>();
        if (text == null) {
            return found;
        }
        Matcher m = pattern.matcher(text);
        while (m.find()) {
            found.add(m.group());
        }
        return found;
    }

    /** Downloads a page, decodes it as UTF-8 and always closes the connection stream. */
    private static String fetchPage(String url) throws IOException {
        InputStream in = doGet(url);
        try {
            return inputStreamToString(in, CHARSET);
        } finally {
            in.close();
        }
    }

    /** Requests a URL and discards the response, releasing the stream immediately. */
    private static void visit(String url) throws IOException {
        doGet(url).close();
    }
}