从网页中取出想要的信息:
导入 jsoup 的 Maven 依赖(添加到 pom.xml 的 dependencies 节点中):
<dependency>
<!-- jsoup HTML parser library @ http://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demo that downloads a web page and pulls information out of it in two ways:
 * by selecting elements with jsoup, and by scanning the raw HTML text with a
 * regular expression.
 */
public class gethtml {

    /**
     * Matches a seven-digit number starting with 13, 15, 18 or 147 that sits
     * directly between a {@code >} and a {@code <}; group 1 is the number.
     * NOTE(review): presumably these are mobile-number prefixes - confirm.
     */
    private static final Pattern NUMBER_SEGMENT =
            Pattern.compile(">(13\\d{5}|15\\d{5}|18\\d{5}|147\\d{4})<");

    /**
     * Downloads the hard-coded page, prints two fields selected from its first
     * table, prints the page title, prints every match of
     * {@link #NUMBER_SEGMENT} found in the raw HTML together with a running
     * count, and finally prints the elapsed time in milliseconds.
     *
     * @param args ignored
     * @throws Exception if the page cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        String str_url = "http://xzsp.bjfda.gov.cn/bfdaww/trends/trendsQueryAction!getXkzInfo.dhtml?zjbh=JY11106111223752&jym=SPJY37248";

        String html = getHtml(str_url);
        printNameAndCode(Jsoup.parse(html));

        // Second download of the same URL, this time letting jsoup do the fetching.
        Document document = Jsoup.connect(str_url).get();
        System.out.println("title===" + document.title());

        Matcher m = NUMBER_SEGMENT.matcher(html);
        int num = 0;
        while (m.find()) {
            System.out.println("打印出的号码段落:" + m.group(1) + " 编号" + (++num));
        }
        System.out.println(num);

        long end = System.currentTimeMillis();
        System.out.println("花费的时间" + (end - start) + "毫秒");
    }

    /**
     * Prints the 2nd and 4th {@code li} of the 2nd {@code ul} found under
     * {@code tr td} in the first table of the document.
     *
     * Every index is bounds-checked first, so a page with an unexpected layout
     * prints less (or nothing) instead of throwing IndexOutOfBoundsException.
     */
    private static void printNameAndCode(Document doc) {
        Elements tables = doc.select("table");
        if (tables.size() == 0) {
            return;
        }
        Elements rows = tables.get(0).select("tr td ul");
        // Index 1 is read below, so at least two matches are required.
        if (rows.size() < 2) {
            return;
        }
        Element row = rows.get(1);
        Elements items = row.select("li");
        if (items.size() > 1) {
            System.out.println("名称:" + items.get(1).text());
        }
        if (items.size() > 3) {
            System.out.println("代码:" + items.get(3).text());
        }
    }

    /**
     * Downloads the resource at the given URL and returns its text.
     *
     * The bytes are always decoded as UTF-8. The content is read line by line
     * and the lines are concatenated without their line terminators, so the
     * result contains no line breaks.
     *
     * @param str_url the URL to download
     * @return the downloaded text with line terminators removed
     * @throws IOException if the URL is malformed or the content cannot be
     *         read; read failures are propagated rather than swallowed, so a
     *         caller never mistakes a failed download for an empty page
     */
    public static String getHtml(String str_url) throws IOException {
        URL url = new URL(str_url);
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }
}