# -*- coding:utf-8 -*-
# urllib库
from urllib.request import urlopen
# 从urllib库的requests模块导入urlopen函数
# Download the demo page and print the raw response bytes.
page = urlopen("http://pythonscraping.com/pages/page1.html")
print(page.read())
# BeautifulSoup库
from bs4 import BeautifulSoup
# 从bs4库加载BeautifulSoup模块
from urllib.request import urlopen
# 从urllib库的requests模块导入urlopen函数
# Fetch the page and parse it. An explicit parser name avoids bs4's
# GuessedAtParserWarning and keeps results identical across environments.
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
# Print the first <h1> tag (html -> body -> h1).
print(bsObj.h1)
# Demonstrate handling the two errors a scrape commonly hits:
# an HTTP error from the server, and a missing tag in the parsed page.
# HTTPError must be in scope BEFORE the except clause can run; the
# original file only imported it further down.
from urllib.error import HTTPError

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    # Server returned an error status (404, 500, ...): report it.
    # Alternatives: return None, break, or fall back to another plan.
    print(e)
else:
    # Only reached when urlopen raised nothing. If the earlier except
    # branch had returned or broken out, this else would be unnecessary.
    if html is None:
        print("URL is not found")
    else:
        # Accessing a tag that does not exist returns None; chaining a
        # second attribute access off that None raises AttributeError.
        try:
            badContent = bsObj.nonExistingTag.anotherTag
        except AttributeError:
            print("Tag was not found")
        else:
            if badContent is None:
                print("Tag was not found")
            else:
                print(badContent)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    """Return the page's <body><h1> tag, or None on failure.

    Args:
        url: Address to fetch.

    Returns:
        The bs4 Tag for the first <h1> inside <body>, or None when the
        request fails with an HTTP error or the document lacks that
        structure.
    """
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        # Explicit parser keeps results consistent across environments.
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError:
        # bsObj.body is None when the document has no <body>, so the
        # chained .h1 access raises AttributeError.
        return None
    return title
# Use identity comparison for None (PEP 8), not ==.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
# 遍历单个域名
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Print the href of every anchor on the page. Explicit parser avoids
# bs4's GuessedAtParserWarning.
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, "html.parser")
for link in bsObj.findAll("a"):
    # Some <a> tags (named anchors) carry no href attribute.
    if 'href' in link.attrs:
        print(link.attrs['href'])
from urllib.request import urlopen
from bs4 import BeautifulSoup
# The original line fused two imports into one syntax error
# ("import datetime import random"); one import per line.
import datetime
import random
import re

# Seed with the current time as a float; passing a datetime object
# directly raises TypeError on Python 3.11+.
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
    """Return all in-article /wiki/ links from a Wikipedia page.

    Matches hrefs that start with /wiki/ and contain no colon, which
    filters out special pages such as /wiki/Category:... .
    """
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))

# Random walk: follow one randomly chosen article link per page until a
# page yields no links.
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
# 链接去重
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Set of every /wiki/ path seen so far, shared across recursive calls.
pages = set()

def getLinks(pageUrl):
    """Recursively crawl /wiki/ links, printing each new page once.

    Deduplicates via the module-level `pages` set. NOTE(review): the
    recursion depth grows with every new page found, so on a site as
    large as Wikipedia this will eventually exceed Python's default
    recursion limit (~1000); an explicit queue/stack would be safer.
    """
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    # Explicit parser avoids bs4's GuessedAtParserWarning.
    bsObj = BeautifulSoup(html, "html.parser")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # First time we have seen this page.
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
# Next step in the original tutorial: write Spider.py (编写Spider.py).
# The lines below were CSDN page boilerplate captured when this post was
# scraped, kept here as comments so the file stays valid Python:
# 猜你喜欢 (You may also like) / 今日推荐 (Today's picks) / 周排行 (Weekly ranking)
# Reposted from blog.csdn.net/zhangyu4863/article/details/80779469