Every site needs its own parsing logic; the script below scrapes https://www.asos.de/damen, a German online fashion store.
1. Fetch the home page https://www.asos.de/damen and collect all category links from the navigation bar.
2. Iterate over the category links and scrape every product in each category, following the pagination.
3. Save all products of a category to an Excel file, and record the category link as done.
4. Finish.
Topics covered:
1. Fetching pages with requests
2. Parsing HTML with BeautifulSoup, mainly via CSS selectors (https://blog.csdn.net/lzz781699880/article/details/81209038)
3. Regular expressions
4. xlwt and basic file I/O
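For reference, a minimal sketch of the BeautifulSoup CSS-selector calls this script relies on (the HTML fragment and the class name are made up for illustration, not taken from asos.de):

from bs4 import BeautifulSoup
demo = BeautifulSoup('<p class="price">19,99 €</p>', 'html.parser')
print(demo.select('.price')[0].text)  # -> 19,99 €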
import math
import os
import random
import re
import time

import pandas as pd
import requests
import xlwt
from bs4 import BeautifulSoup
# Fetch a static page and return it parsed as a BeautifulSoup object
def get_static_html(site_url):
    print('Loading page', site_url)
headers_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
]
    headers = {
        'user-agent': random.choice(headers_list),  # rotate user agents between requests
        'Connection': 'keep-alive'
    }
try:
resp = requests.get(site_url, headers=headers)
except Exception as inst:
print(inst)
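        # Fall back: retry once with SSL certificate verification disabled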
requests.packages.urllib3.disable_warnings()
resp = requests.get(site_url, headers=headers,verify=False)
soup = BeautifulSoup(resp.text, 'html.parser')
return soup
# Save the fetched HTML to a local .html file
def download_html(content, html_path):
    folder = os.path.dirname(html_path)
    if folder and not os.path.exists(folder):  # create the parent folder if missing
        os.makedirs(folder)
    print('Saving HTML to:', '{}.html'.format(html_path))
try:
with open('{}.html'.format(html_path), 'w+', encoding="utf-8") as f:
f.write(content)
except Exception as e:
print(e)
# Export the scraped rows to an Excel (.xls) file
def exportTask(heads,task_done,path,filename):
if not os.path.exists(path):
os.makedirs(path)
task_xls = xlwt.Workbook(encoding='utf-8')
task_sheet1 = task_xls.add_sheet('sheet1')
    # Header row: centered text
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
for i in range(len(heads)):
task_sheet1.col(i).width = 12000
task_sheet1.write(0,i,heads[i],header_style)
    # Write one row per scraped product
for i in range(len(task_done)):
for j in range(len(heads)):
task_sheet1.write(i+1,j,task_done[i][heads[j]])
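    # ':' is not allowed in Windows filenames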
filename = "{0}.xls".format(filename.replace(':','-'))
print(os.path.join(path,filename))
task_xls.save(os.path.join(path,filename))
return filename
# Get the total number of result pages in a category
def getTotalPageNums(url):
soup = get_static_html(url)
page_msg_tage = soup.select('._2sxPqJf')
if len(page_msg_tage) == 0:
return 0,''
page_msg = page_msg_tage[0].text
cate = soup.select('._2wckrGM')[0].text
total_num = page_msg.split(' ')[4]
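    # ASOS shows 72 products per page, so pages = ceil(total / 72);
    # the total uses '.' as a thousands separator, which is stripped before int()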
page_num = math.ceil(int(total_num.replace('.','')) / 72)
    print('Total pages:', page_num)
return page_num,cate
# Extract all product info from a single result page of a category
def getInfoFromSoup(url):
soup = get_static_html(url)
if len(soup.select('._2wckrGM')) == 0:
return []
else:
cate = soup.select('._2wckrGM')[0].text
info_list = []
for tag in soup.select('._3x-5VWa'):
# print(tag)
img_tag = tag.select("._1FN5N-P > img")[0]
desc_tag = tag.select('._10-bVn6 > div > p')[0]
info = { 'cate_url': url,'cate': cate}
info['desc'] = desc_tag.text
if len(tag.select('.JW3hTZk')) == 0:
price_tage = tag.select('._342BXW_')[0]
price_msg = price_tage.text
else:
price_tage = tag.select('.JW3hTZk')[0]
if '"' in price_tage.text:
                    price_msg = price_tage.text.split('"')[1]
else:
price_msg = price_tage.text
info['price'] = price_msg
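            # German price strings use a decimal comma, e.g. '19,99 €'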
price_arr = price_msg.split(' €')
            try:
                info['price_num'] = float(price_arr[0].replace(',', '.'))
            except ValueError:  # price text that is not a plain number
                info['price_num'] = 0
info['product link'] = tag.attrs['href']
            # Product URLs look like '.../prd/<id>?...' or '.../grd/<id>?...'
            pattern = re.compile(r'(?:prd|grd)/([0-9]+)\?')
            info_id = pattern.findall(tag.attrs['href'])
            info['product_id'] = info_id[0] if info_id else ''
if 'src' in img_tag.attrs.keys():
info['img_url'] = img_tag.attrs['src']
else:
info['img_url'] = ''
info_list.append(info)
return info_list
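# Each dict in the returned info_list looks like this (values are illustrative only):
# {'cate_url': '<category page url>', 'cate': 'Kleider', 'desc': '<product name>',
#  'price': '19,99 €', 'price_num': 19.99, 'product link': '<product url>',
#  'product_id': '12345678', 'img_url': '<image url>'}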
# Collect the category links from the home page navigation bar
def getCateUrl():
url = 'https://www.asos.de/damen'
soup = get_static_html(url)
nav_tage = soup.select('._3kg3G5e')[0]
cate_list = []
for a_tage in nav_tage.select('a'):
cate_list.append(a_tage.attrs['href'])
return cate_list
# Scrape all pages of one category and save the products to an Excel file
def downloadExcelByCate(cate_url, path, num):
pagenum,cate = getTotalPageNums(cate_url)
info_list = []
if pagenum > 0:
for i in range(1, pagenum + 1):
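            # Assumes the category URL already contains a query string, so the page number is appended with '&'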
url_page = '{0}&page={1}'.format(cate_url, i)
info_list += getInfoFromSoup(url_page)
            time.sleep(5)  # throttle between page requests
heads = ['cate_url','cate','desc','price','price_num','product_id','product link','img_url']
filename = '{0}-{1}'.format(num,cate)
exportTask(heads, info_list, path, filename)
try:
with open(path+'record.txt', 'a+', encoding="utf-8") as f:
            f.write(cate_url + '\n')  # record this category as done
except Exception as e:
print(e)
# Read back the category links that have already been scraped (enables resuming)
def getDoneUrl(path):
    done_url = []
    if not os.path.exists(path + 'record.txt'):  # first run: nothing recorded yet
        return done_url
    with open(path + 'record.txt', 'r', encoding="utf-8") as f:
        url_list = f.readlines()
    for url in url_list:
        done_url.append(url.rstrip('\n'))
print(done_url)
return done_url
# Merge all per-category .xls files into one workbook
def connectToOne(dir,to_dir):
excel_list = []
for file in os.listdir(dir):
if file.endswith('.xls'):
print("file:",file)
excel_list.append(pd.read_excel(os.path.join(dir,file)))
    print('Merging', len(excel_list), 'files')
    total_excel = pd.concat(excel_list)
    print('Writing merged file')
total_excel.to_excel(os.path.join(to_dir,'asos.xlsx'),index=False)
save_path = 'C:\\Users\\SHEIN\\Desktop\\asos\\'  # output folder; keep the trailing backslash, paths are built by concatenation
if __name__ == '__main__':
cate_url_list = getCateUrl()
done_url = getDoneUrl(save_path)
for i in range(len(cate_url_list)):
if cate_url_list[i] not in done_url:
            downloadExcelByCate(cate_url_list[i], save_path, i + 1)
#connectToOne(save_path,'C:\\Users\\SHEIN\\Desktop')