# -*- coding: utf-8 -*-
"""Demo script: three ways to fetch a page with urllib, then HTML parsing.

The first part fetches ``url`` using (1) a bare ``urlopen`` call, (2) a
``Request`` carrying a custom ``user-agent`` header and (3) an opener that
stores cookies in a ``CookieJar``.  The second part parses a small inline
HTML document with BeautifulSoup and prints its links and title paragraph.
"""
import http.cookiejar
import re
import urllib.request

url = "http://www.baidu.com"

# Upper bound (seconds) for each request so an unresponsive host cannot
# block the script forever.
TIMEOUT_SECONDS = 10

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""


def fetch_plain(target, timeout=TIMEOUT_SECONDS):
    """Open *target* with ``urlopen`` and return ``(status_code, body_bytes)``.

    The response is closed before returning.
    """
    with urllib.request.urlopen(target, timeout=timeout) as response:
        return response.getcode(), response.read()


def fetch_with_user_agent(target, user_agent='Mozilla/5.0',
                          timeout=TIMEOUT_SECONDS):
    """Open *target* through a ``Request`` that carries a ``user-agent`` header.

    Returns ``(status_code, body_bytes)``; the response is closed before
    returning.
    """
    request = urllib.request.Request(target)
    request.add_header("user-agent", user_agent)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.getcode(), response.read()


def fetch_with_cookies(target, timeout=TIMEOUT_SECONDS):
    """Open *target* through an opener that records cookies.

    Returns ``(body_bytes, cookie_jar)`` where ``cookie_jar`` is the
    ``CookieJar`` attached to the opener used for the request.
    """
    # A CookieJar collects the cookies handled by HTTPCookieProcessor.
    cookie_jar = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(handler)
    with opener.open(target, timeout=timeout) as result:
        return result.read(), cookie_jar


def parse_demo(markup):
    """Parse *markup* with BeautifulSoup and print selected nodes.

    Prints the ``href`` and text of every ``<a>`` tag, then the first ``<a>``
    whose ``href`` matches ``ill``, then the text of the first
    ``<p class="title">``.  Nodes that are not found are skipped.
    """
    # Imported here so the urllib helpers above stay usable when the
    # third-party bs4 package is not installed.
    from bs4 import BeautifulSoup

    # No from_encoding: markup is already a str, and BeautifulSoup ignores
    # (and warns about) from_encoding for Unicode input.
    soup = BeautifulSoup(markup, 'html.parser')

    print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
    for link in soup.find_all('a'):
        print(link['href'])
        print(link.get_text())

    node = soup.find('a', href=re.compile(r'ill'))
    if node is not None:
        print(node['href'])
        print(node.get_text())

    p_node = soup.find('p', class_="title")
    if p_node is not None:
        print(p_node.get_text())


def main():
    """Run the three fetch demos followed by the parsing demo."""
    print('第一种方法')
    status, body = fetch_plain(url)
    print(status)
    print(body)

    print('第二种方法')
    status, body = fetch_with_user_agent(url)
    print(status)
    print(body)

    print('第三种方法')
    # Visit the home page; the opener handles cookies automatically.
    body, cookie = fetch_with_cookies('http://www.baidu.com')
    print(body)
    print(cookie)

    print('-------------------------------------')
    parse_demo(html)


if __name__ == "__main__":
    main()
Python 爬虫技术 - BeautifulSoup 的应用
猜你喜欢
转载自blog.csdn.net/welun521/article/details/82861165
今日推荐
周排行