import requests
from bs4 import BeautifulSoup
import xlwt
import sys
def isConnected(url="http://www.baidu.com", timeout=2):
    """Return True if an HTTP GET to *url* succeeds within *timeout* seconds.

    Quick network-reachability probe run before scraping. Defaults keep the
    original no-argument call working.
    """
    try:
        # Response body is irrelevant; only reachability matters.
        requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # DNS failure, refused connection, timeout, etc. -> treat as offline.
        # (Narrowed from a bare `except:` that also swallowed KeyboardInterrupt.)
        return False
    return True
# Abort early when the network is unreachable.
if not isConnected():
    print("网络连接失败")
    # BUG FIX: was sys.exit(0), which reports success to the shell even
    # though the script failed; a failure must exit non-zero.
    sys.exit(1)
# Fetch the game homepage and collect every <a> link inside the news list.
url = "http://mxd.sdo.com/web6/home/index.asp"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
html_code = requests.get(url, headers=headers)
soup = BeautifulSoup(html_code.text, "html.parser")
# BUG FIX: attrs must be a dict. {"class","news-list"} is a SET literal,
# which BeautifulSoup only matches by accident; the intended filter is
# class="news-list".
soup_1 = soup.find("div", attrs={"class": "news-list"})
soup_2 = soup_1.find_all("a")
# Print every news entry as "index - title - absolute URL".
kw = "更多"  # the "more" navigation link; it is not a news item
i = 0
for ele in soup_2:
    if kw in ele:
        continue  # skip the navigation link
    i = i + 1
    href = ele.get("href")
    if ".." in href:
        # Relative links like "../news/x.asp" -> absolute site URL.
        print(i, "-", ele.get_text(), "-", "http://mxd.sdo.com/web6" + href.replace("..", ""))
    else:
        print(i, "-", ele.get_text(), "-", href)
# Write the collected news items to an Excel sheet: ID | 新闻名称 | 地址.
head = ["ID", "新闻名称", "地址"]
workbook = xlwt.Workbook(encoding="utf-8")
sheet_1 = workbook.add_sheet("sheet1")
sheet_1.col(1).width = 16000  # title column, widened for long headlines
sheet_1.col(2).width = 18000  # URL column

for col, title in enumerate(head):
    sheet_1.write(0, col, title)

# BUG FIX: the original built the title list excluding the "更多" link but
# the link list INCLUDING it, so titles and URLs could misalign. Collect
# aligned (title, url) pairs in a single pass instead, and stop relying on
# `kw` leaking out of an earlier loop / on exactly one excluded link.
rows = []
for ele_2 in soup_2:
    if "更多" in ele_2:
        continue  # navigation link, not a news item
    href = ele_2.get("href")
    if ".." in href:
        # Relative links like "../news/x.asp" -> absolute site URL.
        href = "http://mxd.sdo.com/web6" + href.replace("..", "")
    rows.append((ele_2.get_text(), href))

for row, (title, link) in enumerate(rows, start=1):
    sheet_1.write(row, 0, row)    # 1-based ID
    sheet_1.write(row, 1, title)
    sheet_1.write(row, 2, link)

# BUG FIX: xlwt emits the legacy BIFF (.xls) format; saving it with an
# .xlsx extension produces a file Excel refuses to open. Match the format.
workbook.save("test.xls")
# 写出 excel 截图 (Excel output screenshot):