今天想保存网页上的数学公式,图片好多保存好麻烦,所以…请看整活!!!
Java简单小爬虫
运用知识:URL类、数据流、正则、文件
package blank;
/**
* @author blank
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal web scraper: downloads a page, finds its {@code <img .../>} tags with a
 * regular expression and saves every referenced image into a local directory.
 */
public class Spider {
    /** Site root that is prepended to the root-relative image paths found in the page. */
    private static final String BASE_URL = "http://video.kaola100.com";

    /** Page scanned by {@link #main(String[])}. */
    private static final String PAGE_URL = BASE_URL + "/yhcy/yhff/12322";

    /** Directory the images are written to by {@link #main(String[])}. */
    private static final String SAVE_DIR = "D:/math/";

    /** Matches a self-closing {@code <img .../>} tag; group 1 is everything between "<img" and "/>". */
    private static final Pattern IMG_TAG = Pattern.compile("<img(.+?)/>");

    /** Size of the copy buffer used when saving an image. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Downloads the resource at the given URL and returns it as text.
     *
     * <p>The content is read line by line and the lines are concatenated WITHOUT their
     * line terminators, so the result is a single line. {@link #main(String[])} relies on
     * this: its pattern uses {@code .}, which does not match line breaks.
     *
     * @param urlStr  address of the resource to read
     * @param charset name of the charset used to decode the bytes, e.g. {@code "utf-8"}
     * @return the decoded content with all line terminators removed
     * @throws IOException if the URL is malformed or the resource cannot be read
     */
    public static String getURLContent(String urlStr, String charset) throws IOException {
        URL url = new URL(urlStr);
        // Resolve the charset before opening the connection so that a bad charset name
        // cannot leave an already-opened stream behind.
        Charset decoder = Charset.forName(charset);
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even when
        // reading fails half-way.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString();
    }

    /**
     * Downloads the resource at the given URL into a directory, creating the directory
     * if necessary. The file name is the part of the URL after its last {@code '/'};
     * an existing file with that name is overwritten. The target path is printed to
     * standard output.
     *
     * @param urlStr address of the image to download
     * @param dirStr target directory, with or without a trailing separator
     * @throws IOException if the URL is malformed, has no file name after the last
     *                     {@code '/'}, or the download/write fails
     */
    public static void savePic(String urlStr, String dirStr) throws IOException {
        URL url = new URL(urlStr);
        String name = urlStr.substring(urlStr.lastIndexOf('/') + 1);
        if (name.isEmpty()) {
            throw new IOException("URL does not end with a file name: " + urlStr);
        }
        String filename = joinPath(dirStr, name);
        // Both streams are closed automatically, including when the copy fails.
        try (InputStream in = url.openStream()) {
            File dir = new File(dirStr);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            System.out.println(filename);
            // FileOutputStream creates the file itself, no createNewFile() needed.
            try (FileOutputStream out = new FileOutputStream(filename)) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
        }
    }

    /**
     * Appends a file name to a directory string, inserting a separator only when the
     * directory does not already end with one (an empty directory yields the bare name).
     */
    private static String joinPath(String dirStr, String name) {
        if (dirStr.isEmpty() || dirStr.endsWith("/") || dirStr.endsWith(File.separator)) {
            return dirStr + name;
        }
        return dirStr + File.separator + name;
    }

    /**
     * Fetches the configured page, downloads every image whose tag contains a quoted
     * root-relative path (a value starting with {@code "/}) and prints how many images
     * were saved.
     *
     * @param args unused
     * @throws IOException if the page itself cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // Fetch the page
        String html = getURLContent(PAGE_URL, "utf-8");
        // Match the image tags and process them one by one
        Matcher m = IMG_TAG.matcher(html);
        int count = 0;
        while (m.find()) {
            String attrs = m.group(1);
            // The path is the first quoted value that starts with '/'; it ends at the
            // next quote so attributes following it are not swallowed into the URL.
            int start = attrs.indexOf("\"/");
            int end = start < 0 ? -1 : attrs.indexOf('"', start + 1);
            if (start < 0 || end < 0) {
                // No root-relative quoted path in this tag: nothing to download.
                continue;
            }
            String imageUrl = BASE_URL + attrs.substring(start + 1, end);
            System.out.println("下载" + imageUrl + "...");
            try {
                savePic(imageUrl, SAVE_DIR);
                count++;
            } catch (IOException e) {
                // One broken image must not abort the remaining downloads.
                System.err.println("Failed to save " + imageUrl + ": " + e);
            }
        }
        System.out.println("共保存" + count + "个图片");
    }
}
好活!!!运行即保存。不过这段代码并不通用:处理其他网页时,需要对网址、正则表达式和保存路径稍加改动,此处不予赘述~
运行后效果