本文同时用到了 BeautifulSoup 和 XPath 两种解析方式,并演示了保存为 CSV 和保存为 Excel 两种方法;写入 Excel 使用的是 openpyxl。
首先说明一下,这次要抓取的网页是:http://www.allitebooks.org/
import requests
import csv
from wutils import defNum
from bs4 import BeautifulSoup
from lxml import etree
import openpyxl
def resolver_page(data):
    """Extract one row per book from the HTML of a listing page.

    Titles and links are read with BeautifulSoup CSS selectors; authors and
    summaries are read with lxml XPath. The three result lists are matched
    by position, so the pairing assumes that every listed book has a title
    link, an ``h5.entry-author`` element and a ``div.entry-summary``
    element, all in the same document order.

    Args:
        data: HTML text of the page.

    Returns:
        A list of ``[title, href, authors, summary]`` lists. ``authors`` is
        a list of author-name strings and ``summary`` is a string. Every row
        has exactly four fields: ``authors`` is ``[]`` and ``summary`` is
        ``''`` when the page has no matching element at that position.
    """
    soup = BeautifulSoup(data, 'lxml')
    tree = etree.HTML(data)

    # Title text and link target come from the bookmark anchor in each
    # entry title.
    title_links = soup.select('h2[class="entry-title"] a[rel="bookmark"]')
    books = [[link.text, link.attrs.get('href')] for link in title_links]

    # Select the container elements first and query inside each one, so a
    # container with zero or several text nodes cannot shift the values of
    # the books that follow it.
    # (XPath tip: an expression such as //a/@href selects an attribute value.)
    author_nodes = tree.xpath('//h5[@class="entry-author"]')
    summary_nodes = tree.xpath('//div[@class="entry-summary"]')

    for position, book in enumerate(books):
        if position < len(author_nodes):
            authors = author_nodes[position].xpath('.//a[@rel="tag"]/text()')
        else:
            authors = []
        book.append(authors)

        if position < len(summary_nodes):
            summary = ''.join(summary_nodes[position].xpath('./p/text()'))
        else:
            summary = ''
        book.append(summary)

    return books
if __name__ == '__main__':
    # Scrape the listing pages and write one spreadsheet row per book.
    front_url = 'http://www.allitebooks.org/page/'
    # NOTE: this dict is not used by the request below, which takes its
    # headers from defNum().login_heards instead.
    head_list = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }

    # CSV alternative (disabled):
    # fp = open('allbook.csv', 'w', encoding='utf-8')
    # writer = csv.writer(fp)
    # writer.writerow(['书名', '链接', '作者', '简介'])

    # Author's note: after many attempts, the workbook only became usable
    # once it had been saved to disk and loaded again; otherwise an error
    # said that `cell` was not a known method. The attempt that failed was:
    #     wb = openpyxl.Workbook("books.xlsx")
    #     wb.save('books.xlsx')
    # NOTE(review): the first positional parameter of Workbook() is
    # `write_only`, not a file name, so that call most likely produced a
    # write-only workbook -- confirm against the openpyxl documentation.
    # Calling Workbook() with no arguments gives a normal, editable workbook.
    try:
        wb = openpyxl.load_workbook('books.xlsx')
    except FileNotFoundError:
        # First run: there is no file yet, so start from an empty workbook
        # instead of crashing.
        wb = openpyxl.Workbook()

    # Write into a sheet of our own rather than the default one
    # (the default sheet would be `wb.active`).
    ws = wb.create_sheet('book', 1)

    # Header row. Columns 3 and 4 hold authors, column 5 the summary.
    # Keyword form of the same call: ws.cell(row=1, column=1, value='书名')
    ws.cell(1, 1).value = '书名'
    ws.cell(1, 2).value = '链接'
    ws.cell(1, 3).value = '作者'
    ws.cell(1, 5).value = '简介'

    results = []
    for index in range(1, 2):
        page_url = front_url + str(index) + '/'
        data = requests.get(url=page_url, headers=defNum.defNum().login_heards).content.decode()
        result = resolver_page(data)
        # writer.writerows(result)  # CSV alternative
        results.extend(result)
    # fp.close()  # CSV alternative

    # Data rows start at spreadsheet row 2, directly below the header.
    for row_number, row in enumerate(results, start=2):
        ws.cell(row_number, 1).value = row[0]
        ws.cell(row_number, 2).value = row[1]

        # Only columns 3 and 4 are available for authors: the first author
        # goes in column 3 and any further authors are joined into column 4,
        # so the summary in column 5 never overwrites an author name.
        authors = row[2]
        if authors:
            ws.cell(row_number, 3).value = authors[0]
        if len(authors) > 1:
            ws.cell(row_number, 4).value = ', '.join(authors[1:])

        ws.cell(row_number, 5).value = row[3]

    wb.save('books.xlsx')