Every site needs its own parsing logic; the script below scrapes https://www.asos.de/damen, a German online fashion store.
1. Fetch the home page https://www.asos.de/damen and collect all category links from the navigation bar.
2. Iterate over the category links and scrape every product in each category, following the pagination.
3. Save all products of a category to an Excel file, and record the category link as done.
4. Finish.
Topics covered:
1. Fetching pages with requests
2. Parsing HTML with BeautifulSoup, mainly via CSS selectors (https://blog.csdn.net/lzz781699880/article/details/81209038)
3. Regular expressions
4. xlwt and basic file I/O
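For reference, a minimal sketch of the BeautifulSoup CSS-selector calls this script relies on (the HTML fragment and the class name are made up for illustration, not taken from asos.de):

from bs4 import BeautifulSoup
demo = BeautifulSoup('<p class="price">19,99 €</p>', 'html.parser')
print(demo.select('.price')[0].text)  # -> 19,99 €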
import math
import os
import random
import re
import time

import pandas as pd
import requests
import xlwt
from bs4 import BeautifulSoup
# Fetch a static page and return it parsed as a BeautifulSoup object
def get_static_html(site_url):
    print('Loading page', site_url)
headers_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
]
    headers = {
        'user-agent': random.choice(headers_list),  # rotate user agents between requests
        'Connection': 'keep-alive'
    }
try:
resp = requests.get(site_url, headers=headers)
except Exception as inst:
print(inst)
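        # Fall back: retry once with SSL certificate verification disabled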
requests.packages.urllib3.disable_warnings()
resp = requests.get(site_url, headers=headers,verify=False)
soup = BeautifulSoup(resp.text, 'html.parser')
return soup
# Save the fetched HTML to a local .html file
def download_html(content, html_path):
    folder = os.path.dirname(html_path)
    if folder and not os.path.exists(folder):  # create the parent folder if missing
        os.makedirs(folder)
    print('Saving HTML to:', '{}.html'.format(html_path))
try:
with open('{}.html'.format(html_path), 'w+', encoding="utf-8") as f:
f.write(content)
except Exception as e:
print(e)
# Export the scraped rows to an Excel (.xls) file
def exportTask(heads,task_done,path,filename):
if not os.path.exists(path):
os.makedirs(path)
task_xls = xlwt.Workbook(encoding='utf-8')
task_sheet1 = task_xls.add_sheet('sheet1')
    # Header row: centered text
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
for i in range(len(heads)):
task_sheet1.col(i).width = 12000
task_sheet1.write(0,i,heads[i],header_style)
    # Write one row per scraped product
for i in range(len(task_done)):
for j in range(len(heads)):
task_sheet1.write(i+1,j,task_done[i][heads[j]])
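    # ':' is not allowed in Windows filenames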
filename = "{0}.xls".format(filename.replace(':','-'))
print(os.path.join(path,filename))
task_xls.save(os.path.join(path,filename))
return filename
# Get the total number of result pages in a category
def getTotalPageNums(url):
soup = get_static_html(url)
page_msg_tage = soup.select('._2sxPqJf')
if len(page_msg_tage) == 0:
return 0,''
page_msg = page_msg_tage[0].text
cate = soup.select('._2wckrGM')[0].text
total_num = page_msg.split(' ')[4]
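    # ASOS shows 72 products per page, so pages = ceil(total / 72);
    # the total uses '.' as a thousands separator, which is stripped before int()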
page_num = math.ceil(int(total_num.replace('.','')) / 72)
    print('Total pages:', page_num)
return page_num,cate
# Extract all product info from a single result page of a category
def getInfoFromSoup(url):
soup = get_static_html(url)
if len(soup.select('._2wckrGM')) == 0:
return []
else:
cate = soup.select('._2wckrGM')[0].text
info_list = []
for tag in soup.select('._3x-5VWa'):
# print(tag)
img_tag = tag.select("._1FN5N-P > img")[0]
desc_tag = tag.select('._10-bVn6 > div > p')[0]
info = { 'cate_url': url,'cate': cate}
info['desc'] = desc_tag.text
if len(tag.select('.JW3hTZk')) == 0:
price_tage = tag.select('._342BXW_')[0]
price_msg = price_tage.text
else:
price_tage = tag.select('.JW3hTZk')[0]
if '"' in price_tage.text:
                    price_msg = price_tage.text.split('"')[1]
else:
price_msg = price_tage.text
info['price'] = price_msg
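            # German price strings use a decimal comma, e.g. '19,99 €'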
price_arr = price_msg.split(' €')
            try:
                info['price_num'] = float(price_arr[0].replace(',', '.'))
            except ValueError:  # price text that is not a plain number
                info['price_num'] = 0
info['product link'] = tag.attrs['href']
            # Product URLs look like '.../prd/<id>?...' or '.../grd/<id>?...'
            pattern = re.compile(r'(?:prd|grd)/([0-9]+)\?')
            info_id = pattern.findall(tag.attrs['href'])
            info['product_id'] = info_id[0] if info_id else ''
if 'src' in img_tag.attrs.keys():
info['img_url'] = img_tag.attrs['src']
else:
info['img_url'] = ''
info_list.append(info)
return info_list
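# Each dict in the returned info_list looks like this (values are illustrative only):
# {'cate_url': '<category page url>', 'cate': 'Kleider', 'desc': '<product name>',
#  'price': '19,99 €', 'price_num': 19.99, 'product link': '<product url>',
#  'product_id': '12345678', 'img_url': '<image url>'}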
# Collect the category links from the home page navigation bar
def getCateUrl():
url = 'https://www.asos.de/damen'
soup = get_static_html(url)
nav_tage = soup.select('._3kg3G5e')[0]
cate_list = []
for a_tage in nav_tage.select('a'):
cate_list.append(a_tage.attrs['href'])
return cate_list
# Scrape all pages of one category and save the products to an Excel file
def downloadExcelByCate(cate_url, path, num):
pagenum,cate = getTotalPageNums(cate_url)
info_list = []
if pagenum > 0:
for i in range(1, pagenum + 1):
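            # Assumes the category URL already contains a query string, so the page number is appended with '&'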
url_page = '{0}&page={1}'.format(cate_url, i)
info_list += getInfoFromSoup(url_page)
            time.sleep(5)  # throttle between page requests
heads = ['cate_url','cate','desc','price','price_num','product_id','product link','img_url']
filename = '{0}-{1}'.format(num,cate)
exportTask(heads, info_list, path, filename)
try:
with open(path+'record.txt', 'a+', encoding="utf-8") as f:
            f.write(cate_url + '\n')  # record this category as done
except Exception as e:
print(e)
# Read back the category links that have already been scraped (enables resuming)
def getDoneUrl(path):
    done_url = []
    if not os.path.exists(path + 'record.txt'):  # first run: nothing recorded yet
        return done_url
    with open(path + 'record.txt', 'r', encoding="utf-8") as f:
        url_list = f.readlines()
    for url in url_list:
        done_url.append(url.rstrip('\n'))
print(done_url)
return done_url
# Merge all per-category .xls files into one workbook
def connectToOne(dir,to_dir):
excel_list = []
for file in os.listdir(dir):
if file.endswith('.xls'):
print("file:",file)
excel_list.append(pd.read_excel(os.path.join(dir,file)))
    print('Merging', len(excel_list), 'files')
    total_excel = pd.concat(excel_list)
    print('Writing merged file')
total_excel.to_excel(os.path.join(to_dir,'asos.xlsx'),index=False)
save_path = 'C:\\Users\\SHEIN\\Desktop\\asos\\'  # output folder; keep the trailing backslash, paths are built by concatenation
if __name__ == '__main__':
cate_url_list = getCateUrl()
done_url = getDoneUrl(save_path)
for i in range(len(cate_url_list)):
if cate_url_list[i] not in done_url:
            downloadExcelByCate(cate_url_list[i], save_path, i + 1)
#connectToOne(save_path,'C:\\Users\\SHEIN\\Desktop')