写csv文件

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the Wikipedia comparison page and dump its first "wikitable"
# to editors.csv, one CSV record per HTML table row.
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")
# The main comparison table is the first table with class "wikitable".
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

# newline='' hands line-ending control to the csv module; the with-block
# closes the file even if a row fails to serialise.
with open("editors.csv", "wt", newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # Collect header (th) and data (td) cells of this row, then write the
        # row exactly once -- writing inside the cell loop would emit a
        # growing partial row for every cell.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)

抓取页面图片①

import urllib.request
# Download a single image and save its raw bytes to picture.jpg.
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen('http://imgsrc.baidu.com/forum/w%3D580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg') as response:
    cat_img = response.read()
# Binary mode: the payload is image data, not text.
with open('picture.jpg', 'wb') as f:
    f.write(cat_img)

抓取页面图片②

import urllib.request
import re

def getHtml(url):
    """Fetch *url* and return the raw response body as bytes.

    The response is closed as soon as the body has been read, so the
    underlying connection is not left open.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()

def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    Images are matched by the pattern ``src="....jpg" pic_ext`` and saved in
    the current directory as ``0.jpg``, ``1.jpg``, ... in match order.

    Args:
        html: Page source, either as ``str`` or as the raw ``bytes``
            returned by ``urlopen(...).read()``.

    Returns:
        The list of image URLs that were matched (empty if none).
    """
    # The regex below is a str pattern; applying it to bytes raises
    # TypeError, so decode raw page bytes first.  errors='replace' keeps a
    # non-UTF-8 page from aborting the scan.
    if isinstance(html, (bytes, bytearray)):
        html = html.decode('utf-8', errors='replace')
    reg = r'src="(.+?\.jpg)" pic_ext'
    imglist = re.findall(reg, html)
    for x, imgurl in enumerate(imglist):
        # In Python 3 urlretrieve lives in urllib.request, not urllib.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
    return imglist
# Fetch the Tieba thread page, run the image downloader on it, and print
# whatever getImg returns.
html = getHtml("http://tieba.baidu.com/p/2460150866")
print(getImg(html))

为爬虫添加代理ip

import urllib.request
import random

# Route HTTP requests through a randomly chosen proxy and print the page
# returned by the target URL.
url = 'http://whatismyip.com.tw'
iplist = ['121.201.97.136:80', '117.135.164.170:80', '58.247.31.230:80']
# One proxy is picked per run; only the 'http' scheme is proxied.
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
# install_opener makes this opener the default for every later urlopen call.
urllib.request.install_opener(opener)
# The with-block closes the response once the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)

#获取页面内嵌链接
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch the forum thread page; the with-block closes the HTTP response.
with urlopen("http://bbs.gfan.com/android-8397839-1-1.html") as page:
    rawtext = page.read()
soup = BeautifulSoup(rawtext, "html.parser")
# The links of interest sit inside <div class="pg">.  find() returns None
# when the page has no such div, so fall back to an empty link list instead
# of failing with AttributeError.
targetDiv = soup.find('div', {'class': 'pg'})
catalogLinks = targetDiv.find_all('a') if targetDiv is not None else []
# The first anchor is skipped (presumably not a real page link -- verify
# against the live markup).
indexlist = [link.get('href') for link in catalogLinks[1:]]

for index in indexlist:
    print(index)

字典的相关用法

# Nested structure: a single post plus a list of reply dicts.
test = {"post": {"content": ""}, "replys": [{"content": ""}]}

# Nested values are reached by chaining key and index lookups.
test["post"].update({"content": "xx"})
# Overwrite an existing key and add a new one on the first reply.
test["replys"][0].update({"content": "yy", "value": "zz"})

# Grow the reply list with another (empty) reply record.
test["replys"].extend([{"content": "", "title": "", "publish_date": ""}])
def store(measurements):
    """Serialise *measurements* as JSON into ``measurements.json``.

    The file is created in the current working directory and overwritten
    if it already exists.

    Args:
        measurements: Any JSON-serialisable object (dict, list, ...).
    """
    import json
    with open('measurements.json', 'w') as f:
        # Write the argument itself, not a module-level global.
        f.write(json.dumps(measurements))

# Write the example dict to disk only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    store(test)

use_python

写csv文件

抓取页面图片①

抓取页面图片②

为爬虫添加代理ip

字典的相关用法

猜你喜欢