想在网上找一份 4 级行政区划地址数据,结果全都要付费,想了想还是自己爬取吧。
该代码不依赖任何爬虫框架,就是自己做字符串判断。
做到了省/市/区县/街镇四级;第五级没有爬取,感觉用不上,想要的话也可以自己接着再爬一层。注意第 4 层数据量已经到了 4 万条,需要分批取数,不能一次性用 SQL 全部读出来。
如果不想执行代码,我把 SQL 和数据也放了上来,下载地址:
建表语句如下:
-- Table that receives the crawled administrative divisions, one row per division.
CREATE TABLE `mall_areabb` (
-- Division code; the crawler inserts this value explicitly rather than relying on AUTO_INCREMENT.
`area_id` bigint NOT NULL AUTO_INCREMENT,
-- Display name of the division as scraped from the page.
`area_name` varchar(50) DEFAULT NULL,
-- area_id of the parent division; the crawler writes 0 for level-1 rows.
`parent_id` bigint DEFAULT NULL,
-- Depth in the hierarchy; the crawler writes values 1 through 4.
`level` int DEFAULT NULL,
-- Page path of this division relative to the crawl base URL; used to fetch its children.
`path` varchar(200) DEFAULT NULL,
PRIMARY KEY (`area_id`),
-- Index for looking up children by parent (the column comment text means "parent id").
KEY `parent_id` (`parent_id`) COMMENT '上级id'
-- NOTE(review): the AUTO_INCREMENT seed below looks like an artifact of exporting a populated table - confirm.
) ENGINE=InnoDB AUTO_INCREMENT=659011502001 DEFAULT CHARSET=utf8mb3;
代码如下:
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.mysql.cj.jdbc.MysqlDataSource;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.jdbc.core.JdbcTemplate;
/**
 * Crawls administrative-division pages below {@code BASE_URL} and stores the
 * divisions in the {@code mall_areabb} table.
 *
 * <p>No crawler framework is used: pages are downloaded with Apache HttpClient and
 * the links are extracted with plain string searches. Four levels are written
 * (levels 1 to 4); each level is discovered by following the {@code path} stored
 * for the rows of the level above.
 *
 * <p>The JDBC connection defaults to the local {@code dashu} database and can be
 * overridden with the system properties {@code plzl.jdbc.url},
 * {@code plzl.jdbc.user} and {@code plzl.jdbc.password}.
 *
 * @author andy.wang
 * Date: 2022-03-17
 */
public class Plzl {

    /** Root URL that every stored {@code path} is relative to. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";

    /** Start and end markers of an anchor element, separated by {@code ##}. */
    private static final String ANCHOR_KEY = "<a##</a>";

    private static final String DEFAULT_JDBC_URL =
            "jdbc:mysql://localhost:3306/dashu?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai&nullCatalogMeansCurrent=true";
    private static final String DEFAULT_JDBC_USER = "root";
    private static final String DEFAULT_JDBC_PASSWORD = "123456";

    /** Single insert statement shared by all levels; every value is bound as a parameter. */
    private static final String INSERT_SQL =
            " insert into mall_areabb(area_id,area_name,parent_id,level,path) values(?,?,?,?,?)";

    private static final String SELECT_BY_LEVEL_SQL = " select * from mall_areabb where level=? ";

    // Compiled once: delHTMLTag runs for every anchor on every page.
    private static final Pattern SCRIPT_TAG =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_TAG =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Runs the complete crawl: inserts level 1 from {@code index.html}, then walks
     * levels 1 to 3 and inserts the children of every stored row.
     *
     * @param args unused
     * @throws Exception if a database operation fails
     */
    public static void main(String[] args) throws Exception {
        Plzl p = new Plzl();

        MysqlDataSource dataSource = new MysqlDataSource();
        dataSource.setUrl(System.getProperty("plzl.jdbc.url", DEFAULT_JDBC_URL));
        dataSource.setUser(System.getProperty("plzl.jdbc.user", DEFAULT_JDBC_USER));
        dataSource.setPassword(System.getProperty("plzl.jdbc.password", DEFAULT_JDBC_PASSWORD));
        JdbcTemplate jdbcTemplate = new JdbcTemplate(dataSource);

        // Level 1 comes straight from the index page.
        insertDataOne(p.requestByGetMethod(BASE_URL + "index.html", ANCHOR_KEY), jdbcTemplate);

        // Levels 2 to 4: follow the stored path of every row on the level above.
        for (int parentLevel = 1; parentLevel <= 3; parentLevel++) {
            crawlChildren(p, jdbcTemplate, parentLevel);
        }
    }

    /**
     * Fetches the page of every row stored at {@code parentLevel} and inserts the
     * links found there as rows of the next level.
     */
    private static void crawlChildren(Plzl crawler, JdbcTemplate t, int parentLevel) {
        List<Map<String, Object>> parents = t.queryForList(SELECT_BY_LEVEL_SQL, parentLevel);
        for (Map<String, Object> parent : parents) {
            Object id = parent.get("area_id");
            Object pathValue = parent.get("path");
            if (id == null || pathValue == null) {
                continue; // nothing to follow for this row
            }
            String path = pathValue.toString();
            // Links on a page are relative to that page's directory. A path without
            // '/' yields an empty prefix, so child links are stored unchanged.
            String directory = path.substring(0, path.lastIndexOf('/') + 1);
            Map<String, String> children = crawler.requestByGetMethod(BASE_URL + path, ANCHOR_KEY);
            insertDataThree(children, t, parentLevel + 1, Long.valueOf(id.toString()), directory);
        }
    }

    /**
     * Inserts level-1 rows with {@code parent_id} 0.
     *
     * <p>The id is the part of the link before the first {@code '.'} followed by ten
     * zeros. Entries whose link does not produce a numeric id are skipped.
     *
     * @param m link path to display name, as returned by {@link #requestByGetMethod}
     * @param t template used for the insert
     */
    public static void insertDataOne(Map<String, String> m, JdbcTemplate t) {
        if (m == null || m.isEmpty()) {
            return;
        }
        List<Object[]> rows = new ArrayList<>(m.size());
        for (Map.Entry<String, String> entry : m.entrySet()) {
            String link = entry.getKey();
            int dot = link.indexOf('.');
            if (dot <= 0) {
                System.err.println("Skipping level-1 link without a code prefix: " + link);
                continue;
            }
            Long id;
            try {
                id = Long.valueOf(link.substring(0, dot) + "0000000000");
            } catch (NumberFormatException e) {
                System.err.println("Skipping level-1 link with non-numeric code: " + link);
                continue;
            }
            rows.add(new Object[] {id, entry.getValue(), 0L, 1, link});
        }
        if (!rows.isEmpty()) {
            t.batchUpdate(INSERT_SQL, rows);
        }
    }

    /**
     * Inserts child rows whose links are stored exactly as scraped.
     *
     * @param m     link path to {@code "code:name"}
     * @param t     template used for the insert
     * @param level level written to every row
     * @param pid   {@code parent_id} written to every row
     */
    public static void insertDataTwo(Map<String, String> m, JdbcTemplate t, int level, Long pid) {
        insertDataThree(m, t, level, pid, "");
    }

    /**
     * Inserts child rows, storing each link prefixed with {@code path}.
     *
     * <p>Each map value must have the form {@code "code:name"} with a numeric code.
     * Entries that do not match are reported on stderr and skipped, so one odd link
     * does not abort the crawl.
     *
     * @param m     link path to {@code "code:name"}
     * @param t     template used for the insert
     * @param level level written to every row
     * @param pid   {@code parent_id} written to every row
     * @param path  prefix prepended to every link; {@code null} is treated as empty
     */
    public static void insertDataThree(Map<String, String> m, JdbcTemplate t, int level, Long pid, String path) {
        if (m == null || m.isEmpty()) {
            return;
        }
        String prefix = path == null ? "" : path;
        List<Object[]> rows = new ArrayList<>(m.size());
        for (Map.Entry<String, String> entry : m.entrySet()) {
            String value = entry.getValue();
            String[] parts = value == null ? new String[0] : value.split(":", 2);
            if (parts.length < 2) {
                System.err.println("Skipping link without code and name: " + entry.getKey() + " -> " + value);
                continue;
            }
            Long code;
            try {
                code = Long.valueOf(parts[0].trim());
            } catch (NumberFormatException e) {
                System.err.println("Skipping link with non-numeric code: " + entry.getKey() + " -> " + value);
                continue;
            }
            rows.add(new Object[] {code, parts[1], pid, level, prefix + entry.getKey()});
        }
        if (!rows.isEmpty()) {
            t.batchUpdate(INSERT_SQL, rows);
        }
    }

    private CloseableHttpClient getHttpClient() {
        return HttpClients.createDefault();
    }

    /**
     * Downloads {@code url} with an HTTP GET and extracts the links of its last table.
     *
     * <p>Failures are best-effort: the exception is printed and an empty map is
     * returned, so callers can keep crawling the remaining pages.
     *
     * @param url page to download
     * @param key start and end marker of a link element, separated by {@code ##}
     * @return link path to link text; when the same path occurs several times the
     *         texts are joined with {@code ':'}; never {@code null}
     */
    public Map<String, String> requestByGetMethod(String url, String key) {
        Map<String, String> result = new HashMap<>();
        // try-with-resources closes the response and the client on every path.
        try (CloseableHttpClient httpClient = getHttpClient();
             CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(url))) {
            HttpEntity entity = httpResponse.getEntity();
            if (entity != null) {
                result = read(EntityUtils.toString(entity, "utf-8"), key);
            }
        } catch (Exception e) {
            System.err.println("Request failed: " + url);
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Extracts the links found between the last {@code <table} and its closing
     * {@code </table>}.
     *
     * @param s   page HTML
     * @param key start and end marker of a link element, separated by {@code ##}
     * @return link path to link text, empty when the page holds nothing usable
     */
    private Map<String, String> read(String s, String key) {
        Map<String, String> links = new HashMap<>();
        if (s == null || s.length() < 10 || key == null) {
            return links;
        }
        String[] keys = key.split("##");
        if (keys.length < 2 || keys[0].isEmpty() || keys[1].isEmpty()) {
            return links;
        }
        // Pages without the "content" marker are not data pages.
        if (s.lastIndexOf("content") <= 0) {
            return links;
        }
        int tableStart = s.lastIndexOf("<table");
        int tableEnd = tableStart < 0 ? -1 : s.indexOf("</table>", tableStart);
        if (tableEnd < 0) {
            return links;
        }
        String table = s.substring(tableStart, tableEnd);

        int pos = 0;
        while (true) {
            int open = table.indexOf(keys[0], pos);
            if (open < 0) {
                break;
            }
            // Search the end marker after the start marker so the slice is always valid.
            int close = table.indexOf(keys[1], open);
            if (close < 0) {
                break;
            }
            String anchor = table.substring(open, close);
            pos = close + keys[1].length();

            String href = extractHref(anchor);
            if (href == null || href.isEmpty()) {
                continue;
            }
            String text = delHTMLTag(anchor);
            String previous = links.get(href);
            // The same href can appear more than once; join the texts with ':'.
            links.put(href, previous == null ? text : previous + ":" + text);
        }
        return links;
    }

    /** Returns the quoted value of the first {@code href=} attribute, or {@code null} if absent. */
    private static String extractHref(String anchor) {
        int attr = anchor.indexOf("href=");
        if (attr < 0) {
            return null;
        }
        int quotePos = attr + 5;
        if (quotePos >= anchor.length()) {
            return null;
        }
        char quote = anchor.charAt(quotePos);
        if (quote != '"' && quote != '\'') {
            return null;
        }
        int end = anchor.indexOf(quote, quotePos + 1);
        if (end < 0) {
            return null;
        }
        return anchor.substring(quotePos + 1, end);
    }

    /**
     * Removes script blocks, style blocks and all remaining tags from an HTML fragment.
     *
     * @param htmlStr HTML fragment; {@code null} is treated as empty
     * @return the remaining text, trimmed
     */
    public String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String text = SCRIPT_TAG.matcher(htmlStr).replaceAll("");
        text = STYLE_TAG.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll("");
        return text.trim();
    }
}