- 需修改 `output_file` 变量
- 东方财富网 + 腾讯证券
import re
import requests
import traceback
from bs4 import BeautifulSoup
def getHtmlText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or an HTTP 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Must be *called*: a bare attribute reference never raises, so
        # error pages would otherwise be returned as if they were valid.
        r.raise_for_status()
        # Decode using the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report the failure and let the caller skip it.
        print("访问失败")
        return ""
def getStockList(ls, stockurl):
    """Collect stock codes from the links of a listing page.

    The page at ``stockurl`` is downloaded and every ``<a>`` tag is
    inspected; the first substring of its ``href`` that looks like
    ``sh``/``sz`` followed by six digits is appended to ``ls``.

    Args:
        ls: List that receives the codes (mutated in place).
        stockurl: URL of the page listing the stocks.
    """
    html = getHtmlText(stockurl)
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        # Anchors without a usable href cannot carry a stock code.
        if not isinstance(href, str):
            continue
        match = re.search(r'[s][hz]\d{6}', href)
        if match:
            ls.append(match.group(0))
def getStockInfo(ls, stockurl, fpath):
    """Fetch each stock's detail page and append its data to ``fpath``.

    For every code in ``ls`` the page ``stockurl + code + "/gp"`` is
    downloaded and parsed. The stock name and each label/value pair found
    in the page's ``<li>`` items are gathered into a dict, whose ``str()``
    form is appended to ``fpath`` as one line. A stock whose page cannot
    be fetched or parsed is skipped; the error is printed rather than
    silently discarded, and the loop moves on to the next code.

    Args:
        ls: Iterable of stock code strings.
        stockurl: Base URL the code is appended to.
        fpath: Output file path, opened in append mode as UTF-8.
    """
    # Matches the text sitting between a '>' and the next '<' in the markup.
    between_tags = re.compile('>.*?<')
    for stock in ls:
        url = stockurl + stock + "/gp"
        html = getHtmlText(url)
        if html == "":
            continue
        try:
            soup = BeautifulSoup(html, 'html.parser')
            stockName = soup.find('div', attrs={'class': 'title_bg'})
            stockInfo = soup.find('div', attrs={'class': 'col-2 fr'})
            name = stockName.find_all(attrs={'class': 'col-1-1'})[0]
            shortName = name.text.split()[0]
            # '--' in the name slot is treated as "no data" and skipped.
            if shortName == '--':
                continue
            infoDict = {'股票名称': shortName}
            for item in stockInfo.find_all('li'):
                pieces = between_tags.findall(str(item))
                # The second match is the label; strip the padding
                # characters (em space, non-breaking space) from it.
                key = pieces[1][1:-1].replace('\u2003', '').replace('\xa0', '')
                # The fourth match is the value; fall back to '--' when
                # the item has no value part.
                val = pieces[3][1:-1] if len(pieces) > 3 else '--'
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Keep the crawl going, but make the failure visible instead
            # of hiding it (Ctrl-C is no longer swallowed either).
            traceback.print_exc()
            continue
def main():
    """Run the crawl: gather stock codes, then save each stock's details."""
    # Destination file for the scraped records; adjust for your machine.
    output_file = '/home/lwy/Spiders/stockinfo.txt'
    codes = []
    # Step 1: fill ``codes`` from the listing page.
    getStockList(codes, 'http://quote.eastmoney.com/stock_list.html')
    # Step 2: fetch every stock's page and append its data to the file.
    getStockInfo(codes, 'http://gu.qq.com/', output_file)
# Start the crawl only when run as a script, so that importing this module
# does not trigger network requests and file writes.
if __name__ == "__main__":
    main()