- 写csv文件
- 抓取页面图片①
- 抓取页面图片②
- 为爬虫添加代理ip
- 获取页面内嵌链接
- 字典的相关用法
August 31, 2017 8:36 AM
写csv文件
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the Wikipedia comparison page. BeautifulSoup reads the whole response
# while parsing, so the connection can be closed as soon as parsing is done.
with urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors") as html:
    bsObj = BeautifulSoup(html, "lxml")

# The main comparison table is the first "wikitable" on the page.
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

# newline='' hands line-ending control to the csv module, as its docs require.
# The with-block closes the file even if writing a row fails.
with open("editors.csv", "wt", newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per <tr>: the text of every data and header cell.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)
抓取页面图片①
import urllib.request
# Download a single image and store its raw bytes in picture.jpg.
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen('http://imgsrc.baidu.com/forum/w%3D580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg') as response:
    cat_img = response.read()

# Binary mode: the payload is image data, not text.
with open('picture.jpg', 'wb') as f:
    f.write(cat_img)
抓取页面图片②
import urllib.request
import re
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.
    """
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(url) as page:
        return page.read()
def getImg(html):
    """Download every ``.jpg`` referenced in *html* into the working directory.

    Image URLs are taken from ``src="....jpg" pic_ext`` attributes and saved
    as ``0.jpg``, ``1.jpg``, ... in the order they appear in the markup.

    Args:
        html: Page markup as ``str``, or as the ``bytes`` that ``getHtml``
            returns.

    Returns:
        The list of image URLs that were downloaded (empty if none matched).
    """
    # A str pattern cannot be applied to bytes (TypeError), so decode first.
    # The URLs of interest are ASCII, so replacing undecodable bytes elsewhere
    # in the page does not affect the matches.
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    imglist = imgre.findall(html)
    for x, imgurl in enumerate(imglist):
        # In Python 3 urlretrieve lives in urllib.request, not urllib.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
    return imglist
# Fetch the Tieba thread page, hand it to getImg, and print whatever
# getImg returns.
html = getHtml("http://tieba.baidu.com/p/2460150866")
print(getImg(html))
为爬虫添加代理ip
import urllib.request
import random
url = 'http://whatismyip.com.tw'
# Candidate proxies as "host:port"; one is chosen at random on each run.
iplist = ['121.201.97.136:80','117.135.164.170:80','58.247.31.230:80']
# Only requests using the http scheme are routed through the proxy.
proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
# install_opener is process-wide: every later urllib.request.urlopen call
# goes through this opener, and therefore through the chosen proxy.
urllib.request.install_opener(opener)
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)
#获取页面内嵌链接
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Download the forum thread; the with-block closes the HTTP response once
# the body has been read.
with urlopen("http://bbs.gfan.com/android-8397839-1-1.html") as page:
    rawtext = page.read()
soup = BeautifulSoup(rawtext, "html.parser")

# Links are taken from the first <div class="pg"> on the page.
targetDiv = soup.find('div', {'class': 'pg'})
# find() returns None when no such div exists; treat that as "no links"
# instead of failing with AttributeError.
catalogLinks = targetDiv.find_all('a') if targetDiv is not None else []

# The first anchor is skipped (NOTE(review): presumably it is not a real
# target link - confirm against the page markup); keep the href of the rest.
indexlist = [link.get('href') for link in catalogLinks[1:]]
for index in indexlist:
    print(index)
字典的相关用法
# Template structure: one post, plus a list that starts with a single reply.
test = {
    "post": {"content": ""},
    "replys": [{"content": ""}],
}

# Nested values are reached by chaining key and index lookups.
test["post"]["content"] = "xx"
# Overwrite the existing "content" key and add a new "value" key in one call.
test["replys"][0].update(content="yy", value="zz")
# A list stored inside a dict grows like any other list.
test["replys"].append(dict.fromkeys(("content", "title", "publish_date"), ""))
def store(measurements):
    """Write *measurements* as JSON to ``measurements.json``.

    The file is created in the current working directory and overwritten if
    it already exists.

    Args:
        measurements: Any JSON-serializable object.

    Raises:
        TypeError: If *measurements* contains a value ``json`` cannot encode.
    """
    import json

    # Encode before opening the file so a serialization error does not leave
    # a truncated measurements.json behind.
    payload = json.dumps(measurements)
    with open('measurements.json', 'w') as f:
        f.write(payload)
# Write the JSON file only when this file is executed as a script.
if __name__ == "__main__":
    store(test)