目标页面: http://www.cbrc.gov.cn/chinese/jrjg/index.html
爬取所有银行的银行名称和官网地址(没有官网的银行忽略), 并写入数据库。
import re
from urllib.request import urlopen
from urllib import request
import pymysql
def getbank():
    """Scrape bank names and site URLs from the CBRC index page into MySQL.

    Downloads http://www.cbrc.gov.cn/chinese/jrjg/index.html, extracts every
    ``<a href=... target="_blank" style="color:#08619D">`` link whose text is
    made only of CJK characters, and inserts the ``(url, name)`` pairs into
    table ``bankaddress1`` of database ``bank``.

    The table is created on first use; running the function again appends
    rows to the existing table instead of failing on ``create table``.

    Returns:
        None. The scraped pairs are printed and written to the database.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        pymysql.MySQLError: if connecting or any SQL statement fails.
    """
    url = 'http://www.cbrc.gov.cn/chinese/jrjg/index.html'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'
    req = request.Request(url, headers={'User-Agent': user_agent})
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(req) as resp:
        content = resp.read().decode('utf-8')
    print("正在爬取地址")

    # [^"]+ rather than a greedy .+ so one match can never run past the
    # closing quote of its own href and swallow neighbouring anchors that
    # share the same line. An empty href still fails to match, so links
    # without an address are skipped.
    pattern = (r'<a href="(?P<bankaddress>[^"]+)" target="_blank" style="color:#08619D">'
               r'\s*(?P<bankname>[\u4e00-\u9fa5]+)\s*</a>')
    findbankadd = re.findall(pattern, content)
    print(findbankadd)

    create_sql = ('create table if not exists bankaddress1 '
                  '(银行地址 varchar(500) not null , 银行名字 varchar(500) not null )')
    # Placeholders are filled in by the driver, which escapes each value;
    # quotes inside a scraped URL or name can no longer break the statement.
    insert_sql = 'insert into bankaddress1 (银行地址,银行名字) VALUES (%s,%s)'

    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration.
    conn = pymysql.connect(user='root',
                           password='971203', charset='utf8', autocommit=True)
    try:
        conn.select_db('bank')
        cur = conn.cursor()
        try:
            cur.execute(create_sql)
            # Each findall() result is already an (address, name) tuple.
            cur.executemany(insert_sql, findbankadd)
        finally:
            cur.close()
    finally:
        # Release the connection even when a statement raises.
        conn.close()
# Module-level call: the bank scraper runs as soon as this file is executed.
getbank()
- 爬取猫眼电影TOP100(http://maoyan.com/board/4?offset=90)
1). 爬取内容: 电影名称、主演、上映时间、图片url地址, 并保存到mariadb数据库中;
2). 所有的图片保存到本地/mnt/maoyan/电影名.png
import re
from urllib.request import urlopen
from urllib import request
import pymysql
def getmovies():
    """Scrape the Maoyan TOP100 board into MySQL and download every poster.

    Fetches the ten board pages (``offset`` 0, 10, ..., 90). For every poster
    ``<img ... class="board-img" />`` it records the title, cast line,
    release line and poster URL in table ``movies2`` of database ``bank``,
    then saves the poster image as ``movies/<title>.jpg``.

    Cast and release text are looked up in the markup that lies between one
    poster tag and the next, so the fields of a row always belong to the
    same movie even when a neighbouring entry is incomplete.

    Returns:
        None. Parsed rows are printed, stored in the database and the
        images are written to disk.

    Raises:
        urllib.error.URLError: if a page or an image cannot be fetched.
        pymysql.MySQLError: if connecting or any SQL statement fails.
        OSError: if the image directory or a file cannot be written.
    """
    import os  # function-scope import: keeps the file-level imports untouched

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'
    # The title is matched with [^"]+ rather than a CJK-only class: titles
    # containing digits, Latin letters or punctuation were previously
    # dropped, which shifted every later cast/release value onto the wrong
    # movie.
    poster_re = re.compile(
        r'<img data-src="(?P<picture>[^"]+)" alt="(?P<name>[^"]+)" class="board-img" />')
    star_re = re.compile(r'<p class="star">\s*(.+?)\s*</p>')
    release_re = re.compile(r'<p class="releasetime">(.+?)</p>')

    create_sql = ('create table if not exists movies2 '
                  '(电影名字 varchar(60) not null , 主演 varchar(200) not null , '
                  '上映时间 varchar(50) not null , 图片url varchar(200) not null )')
    # Driver-side placeholders: values are escaped by pymysql, not spliced
    # into the SQL text.
    insert_sql = 'insert into movies2 (电影名字,主演,上映时间,图片url) VALUES (%s,%s,%s,%s)'

    # open() does not create missing directories.
    os.makedirs('movies', exist_ok=True)

    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration.
    conn = pymysql.connect(user='root',
                           password='971203', charset='utf8', autocommit=True)
    try:
        conn.select_db('bank')
        cur = conn.cursor()
        try:
            cur.execute(create_sql)
            for page in range(10):
                url = 'http://maoyan.com/board/4?offset=%d' % (page * 10)
                req = request.Request(url, headers={'User-Agent': user_agent})
                with urlopen(req) as resp:
                    content = resp.read().decode('utf-8')
                print("正在爬取地址")

                # Pair each poster with the cast/release paragraphs found
                # before the next poster.
                # Assumes those paragraphs follow the poster <img> inside an
                # entry - TODO confirm against the live markup.
                posters = list(poster_re.finditer(content))
                rows = []
                for idx, poster in enumerate(posters):
                    if idx + 1 < len(posters):
                        segment_end = posters[idx + 1].start()
                    else:
                        segment_end = len(content)
                    segment = content[poster.end():segment_end]
                    star_match = star_re.search(segment)
                    release_match = release_re.search(segment)
                    rows.append((
                        poster.group('name'),
                        star_match.group(1).strip() if star_match else '',
                        release_match.group(1).strip() if release_match else '',
                        poster.group('picture'),
                    ))
                print(rows)

                cur.executemany(insert_sql, rows)

                for name, _star, _released, picture in rows:
                    with urlopen(picture) as img_resp:
                        image = img_resp.read()
                    # Titles may contain characters that are illegal or
                    # meaningful in file names; replace them.
                    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
                    with open('movies/%s.jpg' % safe_name, 'wb') as f:
                        f.write(image)
        finally:
            cur.close()
    finally:
        # Release the connection even when a fetch or statement raises.
        conn.close()
# Module-level call: the movie scraper runs as soon as this file is executed.
getmovies()