(1)正则爬虫
(1.1)打开网址
from urllib.request import urlopen

# Download the demo page. read() returns bytes, so the body is decoded as
# UTF-8 to make any non-ASCII (e.g. Chinese) text readable.
# The `with` block closes the HTTP response once the body has been read.
with urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
) as response:
    html = response.read().decode('utf-8')
print(html)
(1.2)正则匹配
import re

# Non-greedy capture of the text between the <title> tags.
res = re.findall(r"<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])

# re.DOTALL lets "." match newlines, so a paragraph that spans several
# lines is still captured. The result must be bound to `res` (not `es`),
# otherwise the print below would show the title again.
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)
print("\nPage paragraph is: ", res[0])

# Collect the value of every href="..." attribute in the page.
res = re.findall(r'href="(.*?)"', html)
print("\nAll links: ", res)
(2)beautifulsoup
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the demo page; decode the bytes as UTF-8 so non-ASCII text is
# readable. The `with` block closes the HTTP response after reading.
with urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
) as response:
    html = response.read().decode('utf-8')
print(html)

# Parse the document with the lxml parser.
soup = BeautifulSoup(html, features='lxml')

# Attribute access on the soup returns the first tag with that name.
print(soup.h1)
print('\n', soup.p)

# find_all() returns every <a> tag; the link target is stored in the tag's
# "href" attribute, which is read with dictionary-style indexing.
all_href = soup.find_all('a')
all_href = [l['href'] for l in all_href]
print('\n', all_href)
总结一下: 网页的具体内容我还不太懂, 但感觉标签的属性应该是类似 dict 的结构.
(如果网页中有多个同样的 tag, 比如链接 <a>, 我们可以使用 find_all() 来找到所有的选项. 因为我们真正的 link 不是在 <a> 中间 </a>, 而是在 <a href="link"> 里面, 也可以看做是 <a> 的一个属性. 我们能用像 Python 字典的形式, 用 key 来读取 l["href"].)
(3)CSS
扫描二维码关注公众号,回复:
2367086 查看本文章
HTML 和 CSS 是一对好搭档, 它们共同组成了当今的众多网页. 如果这个世界上没有 CSS, 你看到的所有网页可能都长得像这样. 特别“骨感”!
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the list demo page; decode the bytes as UTF-8 so the Chinese
# text it contains is readable. The `with` block closes the HTTP response
# once the body has been read.
with urlopen(
    "https://morvanzhou.github.io/static/scraping/list.html"
) as response:
    html = response.read().decode('utf-8')
print(html)
<head>
...
<!-- 此部分为 CSS -->
<style>
.jan { background-color: yellow; }
...
.month { color: red; }
</style>
</head>
<body>
...
<ul>
<li class="month">一月</li>
<ul class="jan">
<li>一月一号</li>
<li>一月二号</li>
<li>一月三号</li>
</ul>
...
</ul>
</body>
(3.1)按照class匹配
# Parse the previously downloaded page with the lxml parser.
soup = BeautifulSoup(html, features='lxml')

# Use the CSS class to narrow the search: only <li> tags with class "month".
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m.get_text())

# Expected output:
"""
一月
二月
三月
四月
五月
"""

# Locate the first <ul class="jan"> and use it as the parent for a nested
# search, so only the <li> items inside that list are returned.
jan = soup.find('ul', {"class": 'jan'})
d_jan = jan.find_all('li')
for d in d_jan:
    print(d.get_text())

# Expected output:
"""
一月一号
一月二号
一月三号
"""
(4)正则表达
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Download the table demo page; decode the bytes as UTF-8 so non-ASCII text
# is readable. The `with` block closes the HTTP response after reading.
with urlopen(
    "https://morvanzhou.github.io/static/scraping/table.html"
) as response:
    html = response.read().decode('utf-8')

soup = BeautifulSoup(html, features='lxml')

# Keep only <img> tags whose src attribute matches the pattern. The pattern
# is a raw string so that "\." reaches the regex engine as an escaped dot
# instead of being an invalid string escape.
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])

# Expected output:
"""
https://morvanzhou.github.io/static/img/course_cover/tf.jpg
https://morvanzhou.github.io/static/img/course_cover/rl.jpg
https://morvanzhou.github.io/static/img/course_cover/scraping.jpg
"""

# Keep only <a> tags whose href matches the "https://morvan" pattern.
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
for link in course_links:
    print(link['href'])

# Expected output:
"""
https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/scraping
https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
"""