昨天晚上完成了网页的下载，暂时不用和 Heritrix 打交道了。有空我要好好研究一下它的代码，只是现在没那么多时间。
今天对 HTMLParser 有了初步了解，并自己写了一个简单的小程序，可以提取出网页中图片的 URL：
package test; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.TableColumn; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class Extractor { private String outputPath; private String inputPath; private Parser parse; public String getOutputPath() { return outputPath; } public void setOutputPath(String outputPath) { this.outputPath = outputPath; } public String getInputPath() { return inputPath; } public void setInputPath(String inputPath) { this.inputPath = inputPath; } public Parser getParse() { return parse; } public void setParse(Parser parse) { this.parse = parse; } public static void main(String args[]) { Extractor ex = new Extractor(); ex.setInputPath("F:/Workspaces/MyEclipse 7.1/test/src/test/index.html"); ex.setOutputPath("F:/Workspaces/MyEclipse 7.1/test/src/test/"); try { ex.setParse(new Parser("F:/Workspaces/MyEclipse 7.1/test/src/test/index.html")); ex.extract(); } catch (ParserException e) { e.printStackTrace(); } } public void extract(){ NodeFilter pic_filter = new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "series_sy_intro_pic")); NodeFilter Attribute_filter = new AndFilter(new TagNameFilter("td"), new AndFilter(new HasAttributeFilter("class", "bor1_c1"), new HasAttributeFilter("style", "padding:5px;"))); try { this.getParse().setEncoding("gb2312"); NodeList pic_nodes =this.getParse().parse(pic_filter); System.out.println("a"); TableColumn tc = (TableColumn) pic_nodes.elementAt(0); ImageTag it = (ImageTag)(tc.childAt(1).getChildren().elementAt(0)); String imgURL = it.getImageURL(); System.out.println(imgURL); BufferedWriter bw = new BufferedWriter(new FileWriter(new 
File(this.getOutputPath()+"aa.txt"))); bw.write(imgURL); bw.flush(); // for(int i=0;i<pic_nodes.size();i++){ // // } // NodeList atr_nodes = this.getParse().parse(Attribute_filter); // } catch (ParserException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }
过节，休息一下，明天继续……