1.爬取地址
https://you.ctrip.com/sight/Beijing1.html
2.直接干代码
# -*- coding: gbk -*-
import urllib.request
import re
import pymysql
config = {
'host':'127.0.0.1',
'port':3306,#MySQL默认端口
'user':'root',#mysql默认用户名
'password':'root',
'db':'test',#数据库
'charset':'utf8',
}
course={}
# 打开数据库连接
headers = ("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for j in range(0, 336):
# 主页URL
baseUrl = "https://you.ctrip.com/sight/beijing1/s0-p"+str(j)+".html"
pagedata1 = urllib.request.urlopen(baseUrl).read().decode("utf-8", "ignore")
nameUrlPat = '<dt>\n.*?<a target="_blank" href="(.*?)" title=".*?">(.*?)</a>'
# 提取景点的名称
nameUrl = re.compile(nameUrlPat, re.S).findall(pagedata1)
# 插库
for i in range(0, len(nameUrl)):
db = pymysql.connect(**config)
# 使用cursor()方法获取操作游标
cursor = db.cursor()
#print(nameUrl[i][1])
sql="insert into tb_scenic_spot(name) values('%s')"%nameUrl[i][1]
#sql = "selsect id from tb_scenic_spot"
# SQL 插入语句
try:
# 执行sql语句
cursor.execute(sql)
# 提交到数据库执行
db.commit()
except:
# 如果发生错误则回滚
db.rollback()
# 关闭数据库连接
db.close()
3.结果