htmlCleaner以及dom4j的应用实例

   最近项目中涉及了这样一种情况,需要从其他网站中抓取一些信息,然后将获取的信息加以整合和筛选,保存到xml文件中。
    下面的例子是一个完整的实例,希望可以帮助到需要的人:
package com.linkage.cn.htmlclear.test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;

import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import org.dom4j.tree.BaseElement;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
/**
*
* @author dengbin
*
*/
public class HtmlClearTest {
    /**
     * 在指定的年月日的条件下,可以获取最多url个数。
     */
    private static final int READ_NUM = 2000;
    /**
     * 月份的特殊分界线。
     */
    private static final int signalMonth = 10;
    /**
     * URL的特殊分界线。
     */
    private static final int signal10NUM = 10;
    /**
     * URL的特殊分界线。
     */
    private static final int signal100NUM = 10;
   
    /**
     * URL终端最多个数。
     */
    private static final int DEPARNUM = 20;
    /**
     * 根据不同的年月获取不同的数据。
     *
     * @param startYear
     *        int
     * @param startMonth
     *        int
     * @param endYear
     *        int
     * @param endMonth
     *        int
     */
    private void changeUrl(int startYear, int startMonth, int endYear, int endMonth) {

        for (int starYear = startYear; starYear <= endYear; starYear++) {

            label: month: for (int starMonth = startMonth; starMonth <= endMonth;) {
                int k = 0;
                for (int i = 1; i <= READ_NUM;) {
                    StringBuffer sbUrl = new StringBuffer();
                    if (starMonth < signalMonth) {
                        if (i < signal10NUM) {
                            sbUrl.append("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-")
                                    .append(starYear).append("0").append(starMonth).append("-").append("00").append(i);

                        } else if (i < signal100NUM) {
                            sbUrl.append("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-")
                                    .append(starYear).append("0").append(starMonth).append("-").append("0").append(i);
                        } else {
                            sbUrl.append("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-")
                                    .append(starYear).append("0").append(starMonth).append("-").append(i);
                        }
                    } else {
                        if (i < signal10NUM) {
                            sbUrl.append("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-")
                                    .append(starYear).append(starMonth).append("-").append("00").append(i);

                        } else if (i < signal100NUM) {
                            sbUrl.append("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-")
                                    .append(starYear).append(starMonth).append("-").append("0").append(i);
                        } else {
                            sbUrl.append("http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-")
                                    .append(starYear).append(starMonth).append("-").append(i);
                        }
                    }

                    URL strUrl = null;
                    try {
                        strUrl = new URL(sbUrl.toString());
                        /**
                         * 构造HtmlCleaner实例
                         */
                        HtmlCleaner htmlClear = new HtmlCleaner();
                       
                        //根据url读取网页,注意这里中文易出现乱码,所以可以设置编码格式。
                       
                        TagNode rootNode = htmlClear.clean(strUrl, "UTF-8");

                        /**
                         * 根据class属性为dispalyitem,得到tagNode数组
                         */
                        TagNode[] classNode = rootNode.getElementsByAttValue("class", "displayitem", true, false);
                        if (classNode == null || classNode.length == 0) {
                            k++;
                        } else {
                            //System.out.println(startYear + "--" + starMonth + "--" + i);
                            /**
                             * 根据class属性为details,得到tagNode数组
                             */
                            TagNode[] tableNode = rootNode.getElementsByAttValue("class", "details", true, false);
                            TagNode[] tdNode = tableNode[0].getElementsByName("td", true);
                           
                            writeToXml(classNode, tdNode[15]);
                            k = 0;
                        }
                        i++;
                        if (k > DEPARNUM) {
                            starMonth++;
                            continue month;
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                    }

                }
            }
        }

    }

    /**
     * 使用dom4j读写xml文件。
     *
     * @param classNode
     *        TagNode[]
     * @param tdNode
     *        TagNode
     * @throws DocumentException
     *         Exception
     */
    public void writeToXml(TagNode[] classNode, TagNode tdNode) throws DocumentException {

        File file = new File("vuln.xml");

        //读取已存在xml
        Document document = new SAXReader().read("vuln.xml");

       
        Element root = document.getRootElement();//获得根节点
        /**
         * 构造数据,添加到已存在的文件中
         */
        Element vulnE = new BaseElement("VULN_DESC");
        vulnE.addElement("VULN_NAME").addText(changeNodeText(classNode[0]));
        vulnE.addElement("VULN_CNND_NUM").addText(changeNodeText(classNode[1]));
        vulnE.addElement("VULN_DAMAGE").addText(changeNodeText(classNode[4]));//文中出现的这些4,5,6之类的数字。为了方便期间没有做解释
        vulnE.addElement("VULN_OS").addText(changeNodeText(classNode[5]));
        vulnE.addElement("VULN_TYPE").addText(changeNodeText(classNode[6]));

        vulnE.addElement("VULN_CVE_NUM").addText(changeNodeText(tdNode));
        vulnE.addElement("VULN_CONTENT").addText(changeNodeText(classNode[classNode.length - 2]));
        vulnE.addElement("VULN_SOLUTION").addText(changeNodeText(classNode[classNode.length - 1]));
        root.add(vulnE);
       
        //写入到xml文件

        try {
            FileOutputStream fos = new FileOutputStream(file);
            //设置编码格式。主要是针对中文乱码问题
            OutputFormat of = OutputFormat.createPrettyPrint();
            // 输出为GBK码解决在windows下某些系统下打开含有中文xml乱码的情况  
            of.setEncoding("UTF-8");
            XMLWriter xw = new XMLWriter(fos, of);
            xw.write(document);
            xw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }
    /**
     * 对网页中的一些特殊字符做处理,主要是空格,以及&nbsp;.
     * @param classNode
     *        TagNode
     * @return
     *        String
     *       
     */
    private String changeNodeText(TagNode classNode) {
        return classNode.getText().toString().replaceAll("&nbsp;", "").trim();
    }

    /**
     * 调用方法。
     * @param args
     *        String[]
     */
    public static void main(String[] args) {
        /**
         * 设置代理。其实代理涉及的内容挺多的。下一结做讨论
         */
        System.setProperty("socksProxyHost", "127.0.0.1");
        System.setProperty("socksProxyPort", "6888");
       
        HtmlClearTest hTest = new HtmlClearTest();
        hTest.changeUrl(1996, 1, 2011, 11);

    }

}


如果还有什么问题,或者更好的建议,希望我们共同成长!

猜你喜欢

转载自dengbinbin.iteye.com/blog/1198664