读取 HTML 网页文件中的内容，并可自定义需要去除（或保留）的 HTML 标签

工具类:
/**
 * Helpers for extracting content from HTML: reading an HTML file from disk and
 * stripping every tag except a small whitelist ({@code div}, {@code br},
 * {@code span}, {@code p}).
 *
 * <p>All methods are static and keep no mutable state.
 */
public class HtmlUtils {

    /** Charset used to decode HTML files read from disk. */
    private static final Charset FILE_CHARSET = Charset.forName("UTF-8");

    // Patterns are compiled once instead of on every call. The regex classes are
    // referenced by their fully-qualified name so that this class does not depend
    // on an additional import being present in the file.

    /** A complete {@code <script>...</script>} element including its body (any letter case). */
    private static final java.util.regex.Pattern SCRIPT_BLOCK = java.util.regex.Pattern.compile(
            "<script\\b[^>]*>[\\s\\S]*?</script\\s*>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /** A complete {@code <style>...</style>} element including its body (any letter case). */
    private static final java.util.regex.Pattern STYLE_BLOCK = java.util.regex.Pattern.compile(
            "<style\\b[^>]*>[\\s\\S]*?</style\\s*>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /**
     * Any tag that is NOT an opening or closing div/br/span/p tag.
     *
     * <p>The inner look-ahead {@code (?=[\s/>])} requires the tag name to end right
     * after the whitelisted word, so {@code <pre>} or {@code <param>} are not
     * mistaken for {@code <p>}. {@code [^>]*} (rather than {@code .*?}) lets a tag
     * span several lines.
     */
    private static final java.util.regex.Pattern NON_WHITELISTED_TAG = java.util.regex.Pattern.compile(
            "<(?!/?(?:div|br|span|p)(?=[\\s/>]))[^>]*>", java.util.regex.Pattern.CASE_INSENSITIVE);

    /**
     * Returns the content of an HTML fragment with all non-whitelisted tags removed.
     *
     * <p>Currently a thin wrapper around {@link #delHtmlTags(String)}; whitespace
     * inside the text is left untouched.
     *
     * @param htmlStr HTML source; {@code null} is treated as empty
     * @return the stripped, trimmed text (never {@code null})
     */
    public static String getTextFromHtml(String htmlStr) {
        return delHtmlTags(htmlStr);
    }

    /**
     * Reads a UTF-8 encoded HTML file and returns its content with tags stripped.
     *
     * <p>Processing steps: the file is decoded as a whole, carriage returns are
     * dropped, every line is trimmed and the lines are concatenated without a
     * separator, HTML entities are unescaped, and finally
     * {@link #getTextFromHtml(String)} removes the tags.
     *
     * <p>This method is best-effort: it never throws for a missing or unreadable
     * file but returns an empty string instead.
     *
     * @param path file system path of the HTML file
     * @return the extracted content, or {@code ""} if {@code path} is {@code null}
     *         or the file cannot be read or processed
     */
    public static String readHtml(String path) {
        if (path == null) {
            return "";
        }
        try {
            // Decode the whole file in one go. Decoding fixed-size chunks separately
            // corrupts any multi-byte UTF-8 character that straddles a chunk
            // boundary, and the previous chunk bookkeeping emitted the unfinished
            // tail of each chunk twice.
            byte[] raw = java.nio.file.Files.readAllBytes(new File(path).toPath());
            String markup = joinTrimmedLines(new String(raw, FILE_CHARSET));

            // StringEscapeUtils is presumably Apache Commons Lang 2.x - confirm
            // against the project's dependencies.
            // NOTE(review): entities are unescaped BEFORE tags are stripped, so text
            // such as "a &lt; b &gt; c" loses "< b >". The order is kept because
            // unescaping afterwards could re-introduce markup (e.g. "&lt;script&gt;")
            // into the result.
            String unescaped = StringEscapeUtils.unescapeHtml(markup);
            return getTextFromHtml(unescaped);
        } catch (Exception ignored) {
            // Deliberate best-effort contract: callers receive "" on any failure
            // (missing file, I/O error, invalid path) rather than an exception.
            return "";
        }
    }

    /**
     * Removes {@code <script>} and {@code <style>} elements together with their
     * bodies, then removes every remaining tag except opening and closing
     * {@code div}, {@code br}, {@code span} and {@code p} tags. Matching is
     * case-insensitive.
     *
     * <p>Dropping script bodies keeps JavaScript source out of the result; dropping
     * style bodies keeps CSS source out of it.
     *
     * @param htmlStr HTML source; {@code null} is treated as empty
     * @return the remaining text with leading and trailing whitespace trimmed
     *         (never {@code null})
     */
    public static String delHtmlTags(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String result = SCRIPT_BLOCK.matcher(htmlStr).replaceAll("");
        result = STYLE_BLOCK.matcher(result).replaceAll("");
        result = NON_WHITELISTED_TAG.matcher(result).replaceAll("");
        return result.trim();
    }

    /**
     * Drops carriage returns, trims every line (including the last one) and
     * concatenates the lines without any separator.
     */
    private static String joinTrimmedLines(String text) {
        StringBuilder joined = new StringBuilder(text.length());
        // limit -1 keeps trailing empty segments; they contribute nothing once trimmed.
        for (String line : text.replace("\r", "").split("\n", -1)) {
            joined.append(line.trim());
        }
        return joined.toString();
    }

}

测试:
/**
 * Command-line demo: reads an HTML file through {@code HtmlUtils.readHtml} and
 * prints the extracted content to standard output.
 */
public class Test00 {

    /** File that is read when no path is given on the command line. */
    private static final String DEFAULT_PATH = "D:\\tess.html";

    /**
     * Prints the extracted content of an HTML file.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to read.
     *             When absent, {@link #DEFAULT_PATH} is used.
     */
    public static void main(String[] args) {
        // Allow the demo to run on machines where the default Windows path does not exist.
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        String html = HtmlUtils.readHtml(path);
        System.out.println(html);
    }

}
读取的html内容:
在这里插入图片描述
控制台打印结果:
在这里插入图片描述

发布了11 篇原创文章 · 获赞 12 · 访问量 4126

猜你喜欢

转载自blog.csdn.net/qq_38991369/article/details/94573421