# -- Notes: extract only the <div> elements of a specified class ------------
# Example of the core scraping step used below:
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(h, 'html.parser')
#   a = soup.find_all('div', class_="post-text")
#   ar = re.compile(r'<[^>]+>', re.S)
#   ad = ar.sub('', str(a))
#
# Where:
#   h      : the fetched HTML text
#   class_ : the class of the <div> elements to extract
#   ar     : regex matching any HTML tag
#   ad     : the text with tags stripped out
import re
import time

import requests
from bs4 import BeautifulSoup

# Present a desktop-browser User-Agent so Stack Overflow serves normal pages.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}

# Matches any HTML tag; compiled once instead of on every post (was rebuilt
# inside the innermost loop).
tag_re = re.compile(r'<[^>]+>', re.S)

# Walk the first 33 search-result pages for the query "docker cpu".
for k in range(33):
    res = requests.get(
        "https://stackoverflow.com/search?page=" + str(k) + "&q=docker+cpu",
        headers=headers,
    )
    html = res.text
    # Each match is (relative question URL, question title).
    url = re.findall(
        '<a href="(.*?)" data-searchsession=".*?" title="(.*?)" class="question-hyperlink">',
        html,
    )
    for u in url:
        # Second-to-last path segment of the question URL is used as the
        # output file name (presumably the numeric question id — unverified).
        file_name = u[0].split('/')[-2]
        response = requests.get("https://stackoverflow.com" + u[0], headers=headers)
        h = response.text
        soup = BeautifulSoup(h, 'html.parser')
        # The question body and every answer body carry class "post-text".
        a = soup.find_all('div', class_="post-text")
        for i, post in enumerate(a):
            # One file per post: <file_name>[<post-index>].txt
            with open("D://stack//" + file_name + '[' + str(i) + '].txt',
                      'w+', encoding="utf-8") as f:
                # Strip markup, keep the visible text; the `with` block
                # closes the file (the old explicit f.close() was redundant).
                f.write(tag_re.sub('', str(post)))
        time.sleep(1)   # be polite between question fetches
    time.sleep(10)      # longer pause between result pages