import requests
from bs4 import BeautifulSoup
import re
import time
import lxml
class Spider():
    """Fetch a fixed range of numbered pages and append their paragraphs to a file.

    For every page id in ``CHAPTER_IDS`` the spider requests
    ``URL_TEMPLATE.format(id)``, pulls the text found between ``<p>`` and
    ``</p>`` tags, prints each piece and appends it (one per line) to
    ``OUTPUT_PATH``.  Pages that do not answer with HTTP 200 are skipped.

    The class-level settings below may be overridden on a subclass or on an
    instance before calling :meth:`go`.
    """

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.88 Safari/537.36'
    }

    # File that receives the extracted text (opened in append mode).
    OUTPUT_PATH = 'C:/Users/baishuai/Desktop/斗破苍穹.txt'
    # Page URL pattern; ``{}`` is replaced by the page id.
    URL_TEMPLATE = 'http://www.doupoxs.com/doupocangqiong/{}.html'
    # Page ids to download: 1562 through 1665 inclusive.
    CHAPTER_IDS = range(1562, 1666)
    # Seconds to wait for the server before giving up on a request; without
    # a timeout a single stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10
    # Seconds to pause after each page so the server is not hammered.
    REQUEST_DELAY = 1

    # re.S lets ``.`` match newlines, so paragraphs spanning lines are kept whole.
    _PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>', re.S)

    @classmethod
    def _extract_paragraphs(cls, page_text):
        """Return the inner text of every ``<p>...</p>`` pair in *page_text*."""
        return cls._PARAGRAPH_RE.findall(page_text)

    @classmethod
    def _chapter_urls(cls):
        """Return the list of page URLs to visit, in ascending id order."""
        return [cls.URL_TEMPLATE.format(page_id) for page_id in cls.CHAPTER_IDS]

    def __analyse(self, url):
        """Download *url* and append its paragraphs to ``OUTPUT_PATH``.

        Does nothing when the response status is not 200.  Network errors
        raised by ``requests`` and decode errors propagate to the caller.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return

        paragraphs = self._extract_paragraphs(response.content.decode('utf-8'))

        # ``with`` guarantees the file is closed even if a write fails, and an
        # explicit encoding keeps the output independent of the platform's
        # default code page (which may be unable to encode the page text).
        with open(self.OUTPUT_PATH, 'a', encoding='utf-8') as out:
            for paragraph in paragraphs:
                print(paragraph)
                out.write(paragraph + '\n')

    def __urll(self):
        """Process every page URL in turn, pausing between requests."""
        for url in self._chapter_urls():
            self.__analyse(url)
            time.sleep(self.REQUEST_DELAY)

    def go(self):
        """Run the full crawl."""
        self.__urll()
# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network requests and file writes.
if __name__ == "__main__":
    spider = Spider()
    spider.go()