版权声明:转载请注明出处 https://blog.csdn.net/nanhuaibeian/article/details/86644582
- 审查元素
- 分析
- 功能代码
def extract_tag_a(self, columns, index):
    """Return the text and href of the first link in one table cell.

    Args:
        columns: Sequence of cell elements; each must support ``xpath``.
        index: Position of the cell to read.

    Returns:
        A ``(text, href)`` tuple taken from the first ``a`` child of the cell.

    Raises:
        IndexError: If the cell has no ``a`` child.
        KeyError: If the link has no ``href`` attribute.
    """
    # Look the link up once and read both values from the same element.
    anchor = columns[index].xpath('a')[0]
    return anchor.text, anchor.attrib['href']
def extract_text(self, columns, index):
    """Return the text of one table cell, substituting 0 when it has none.

    Used for the rating, like-count and reply-count cells.

    Args:
        columns: Sequence of cell elements exposing a ``text`` attribute.
        index: Position of the cell to read.

    Returns:
        The cell's ``text`` unchanged (a string), or the integer ``0`` when
        ``text`` is ``None``. An empty string is returned as-is.
    """
    text = columns[index].text
    # Identity test: only a genuinely missing text node is replaced.
    if text is None:
        return 0
    return text
def get_post_list(self):
    """Collect one dict per row of the board's topic table.

    Reads ``self.tree`` and selects every ``tr`` under the ``tbody`` of the
    table whose class is exactly ``board-list tiz``.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``author_id``,
        ``author_url``, ``rating``, ``num_likes`` and ``num_replies``.
    """
    collected = []
    for tr in self.tree.xpath("//table[@class='board-list tiz']/tbody/tr"):
        cells = tr.xpath('td')
        # Cell 1 holds the topic link, cell 3 the author link.
        title, url = self.extract_tag_a(cells, 1)
        author_id, author_url = self.extract_tag_a(cells, 3)
        collected.append({
            'title': title,
            'url': url,
            'author_id': author_id,
            'author_url': author_url,
            # Cells 4-6: rating, number of likes, number of replies.
            'rating': self.extract_text(cells, 4),
            'num_likes': self.extract_text(cells, 5),
            'num_replies': self.extract_text(cells, 6),
        })
    return collected
- 主要代码
import re
import requests
from lxml import etree
class PostListCrawler:
    """Crawler for the topic list of a newsmth.net board page.

    Call ``get_content`` first: it stores the response text in ``self.html``
    and the parsed document in ``self.tree``, which the other methods read.
    """

    # Scheme and host prepended to every board path.
    domain = "https://www.newsmth.net"

    def get_content(self, board_url, page, timeout=10):
        """Download one page of a board and keep it on the instance.

        Args:
            board_url: Board path appended to ``domain``.
            page: Page number, sent as the ``p`` query parameter.
            timeout: Seconds handed to ``requests.get`` so a stalled server
                cannot block forever; pass ``None`` to wait indefinitely.

        Returns:
            None. The results are stored in ``self.html`` and ``self.tree``.
        """
        querystring = {"ajax": "", "p": str(page)}
        url = self.domain + board_url
        r = requests.get(url, params=querystring, timeout=timeout)
        # Kept on the instance so later calls need no further request.
        self.html = r.text
        self.tree = etree.HTML(r.text)

    def get_max_page(self):
        """Return the largest page number listed in the first pager, as an int.

        Reads the ``li`` items of the first ``ol`` with class ``page-main``
        from ``self.tree`` (the document parsed by ``get_content``).

        Returns:
            1 when the pager has at most one item (including no pager at
            all); otherwise the number taken from the pager's last entries.
        """
        pages = self.tree.xpath('//ol[@class="page-main"][1]/li')
        # Zero or one item: there is nothing beyond the first page.
        if len(pages) <= 1:
            return 1
        last_text = pages[-1].xpath('a')[0].text
        # A trailing '>>' item is presumably the "next page" link, so the
        # number sits in the item just before it.
        if last_text == '>>':
            return int(pages[-2].xpath('a')[0].text)
        # Otherwise the last item itself carries the number; convert it so
        # every branch returns an int.
        return int(last_text)

    def extract_tag_a(self, columns, index):
        """Return ``(text, href)`` of the first link in the cell at *index*.

        Raises:
            IndexError: If the cell has no ``a`` child.
            KeyError: If the link has no ``href`` attribute.
        """
        anchor = columns[index].xpath('a')[0]
        return anchor.text, anchor.attrib['href']

    def extract_text(self, columns, index):
        """Return the text of the cell at *index*, or 0 when it has none.

        Used for the rating, like-count and reply-count cells. Existing text
        is returned unchanged as a string.
        """
        text = columns[index].text
        if text is None:
            return 0
        return text

    def get_post_list(self):
        """Collect one dict per row of the board's topic table.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``author_id``,
            ``author_url``, ``rating``, ``num_likes`` and ``num_replies``.
        """
        rows = self.tree.xpath("//table[@class='board-list tiz']/tbody/tr")
        posts = []
        for row in rows:
            columns = row.xpath('td')
            # Cell 1 holds the topic link, cell 3 the author link.
            title, url = self.extract_tag_a(columns, 1)
            author_id, author_url = self.extract_tag_a(columns, 3)
            posts.append({
                'title': title,
                'url': url,
                'author_id': author_id,
                'author_url': author_url,
                # Cells 4-6: rating, number of likes, number of replies.
                'rating': self.extract_text(columns, 4),
                'num_likes': self.extract_text(columns, 5),
                'num_replies': self.extract_text(columns, 6),
            })
        return posts
if __name__ == "__main__":
    # Fetch the first page of the AutoWorld board, then print its parsed topics.
    plc = PostListCrawler()
    # get_content() stores the page on the crawler and returns nothing,
    # so its result is not bound to a name.
    plc.get_content('/nForum/board/AutoWorld', 1)
    print(plc.get_post_list())