提取本地html文件标签(正则)

htmlPath 是本地网页文件的路径,调用 doc() 之前必须先赋值。
读取文件时使用的字符编码(代码中为 utf-8)要与网页文件的实际编码一致,否则内容会出现乱码。
package Spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads a local HTML file and prints the first region matched by an
 * anchor-tag regular expression.
 *
 * <p>Usage: assign {@link #htmlPath}, then call {@link #doc()}. The file is
 * decoded as UTF-8, so it must actually be stored in that encoding.
 */
public class Doc {
    /** Path of the local HTML file to scan; must be assigned before {@link #doc()} is called. */
    String htmlPath = null;

    /** File content (line terminators stripped) read by the most recent {@link #doc()} call. */
    StringBuilder sb = new StringBuilder();

    /**
     * Loads the file at {@link #htmlPath} into {@link #sb}, prints the first
     * regex match (if any) to standard output, then prints a completion marker.
     *
     * @throws NullPointerException if {@link #htmlPath} has not been set
     * @throws Exception if the file cannot be opened or read
     */
    public void doc() throws Exception {
        if (htmlPath == null) {
            throw new NullPointerException("htmlPath must be set before calling doc()");
        }
        File input = new File(htmlPath);

        // NOTE(review): the quantifier is greedy, so a match runs from the first
        // "<a" to the LAST "/a>" in the file, and "<a" also matches tags such as
        // "<abbr". Kept as-is to preserve output; confirm whether this is intended.
        String regx = "<a[\\s\\S]+/a>";

        // Start from an empty buffer so repeated calls do not accumulate content.
        sb.setLength(0);

        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(input), "utf-8"))) {
            String line;
            // readLine() drops line terminators, so the lines are joined directly.
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
        }

        Matcher matcher = Pattern.compile(regx).matcher(sb);
        // group() is only legal after a successful find(); skip printing otherwise.
        if (matcher.find()) {
            System.out.println(matcher.group());
        }
        System.out.println("完毕");
    }
}

  

2019-05-16 02:05:07

猜你喜欢

转载自www.cnblogs.com/xybz/p/10873263.html