爬取xx百科首页数据

# Scrape the front-page posts of Qiushibaike (qiushibaike.com).
import requests
from lxml import etree

def load_page(url):
    """Download *url* and hand its HTML to ``deal_data``.

    The request is sent with a desktop-browser User-Agent header. Newline
    characters are removed from the response text before it is parsed.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page and
    # silently printing nothing.
    response.raise_for_status()
    html = response.text.replace('\n', '')
    deal_data(html)

def deal_data(html):
    """Extract every post from a page's HTML and print each one as a dict.

    A post is any ``div`` whose ``id`` contains ``qiushi_tag_``. For each
    post a dict with the keys ``username``, ``content``, ``img``, ``zan``
    and ``comment`` is printed. ``username`` and ``img`` are the raw XPath
    result lists; ``content``, ``zan`` and ``comment`` are single text
    nodes, or ``None`` when the post does not contain the expected element.

    Args:
        html: HTML source of the page as a string.
    """
    root = etree.HTML(html)
    if root is None:
        # Nothing parseable (e.g. an empty document): no posts to report.
        return

    for post in root.xpath("//div[contains(@id,'qiushi_tag_')]"):
        username = post.xpath("./div/a/h2/text()")
        content_nodes = post.xpath(".//div[@class='content']/span/text()")
        img = post.xpath(".//div[@class='thumb']//img/@src")
        # The two counters are read from the text of the post's <i> elements:
        # the first direct text node and the second descendant text node.
        # NOTE(review): presumably these are the like and comment counts;
        # confirm against the page markup.
        direct_i_text = post.xpath(".//i/text()")
        nested_i_text = post.xpath(".//i//text()")

        # Guard each index so a post missing an element yields None for that
        # field instead of raising IndexError and aborting the whole page.
        res_data = {
            "username": username,
            "content": content_nodes[0] if content_nodes else None,
            "img": img,
            "zan": direct_i_text[0] if direct_i_text else None,
            "comment": nested_i_text[1] if len(nested_i_text) > 1 else None,
        }
        print(res_data)

def main():
    """Prompt for a page number and scrape the matching listing page.

    The text typed by the user is inserted unchanged into the page URL,
    which is then passed to ``load_page``.
    """
    requested_page = input("请输入要爬取的页码:")
    load_page("https://www.qiushibaike.com/8hr/page/{}/".format(requested_page))

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/sdzhr/article/details/80962981