在上次的基础上，自己摸索着调试出对淘宝商品的搜索和整理，其中主要使用了正则表达式，基本代码如下：
# CrowTaobaoPrice
# Fetch Taobao search-result pages for a keyword and print, for every item
# found, its title, price, seller nickname and location. The four fields are
# extracted from the page text with regular expressions.
import json
import re
import traceback
from urllib.parse import quote

SEARCH_URL = "https://s.taobao.com/search?q="
# Fixed query-string tail; the result offset is appended after the final "s=".
SEARCH_SUFFIX = (
    '&imgfile=&js=1&stats_click=search_radio_all%3A1'
    '&initiative_id=staobaoz_20180513&ie=utf8&bcoffset=3'
    '&ntoffset=0&p4ppushleft=1%2C48&s='
)
# Offset step between consecutive pages (presumably the page size).
ITEMS_PER_PAGE = 44

# A quoted value may contain backslash escapes (including \"), so match
# "anything but quote/backslash, or a backslash followed by one character"
# instead of a plain non-greedy ".*?" that would stop at an escaped quote.
_QUOTED_BODY = r'((?:[^"\\]|\\.)*)'
_TITLE_RE = re.compile(r'"raw_title":"' + _QUOTED_BODY + '"')
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
_SELLER_RE = re.compile(r'"nick":"' + _QUOTED_BODY + '"')
_LOCATION_RE = re.compile(r'"item_loc":"' + _QUOTED_BODY + '"')
_FIELD_PATTERNS = (_TITLE_RE, _PRICE_RE, _SELLER_RE, _LOCATION_RE)


def getHtmlText(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text, using the encoding detected from the
        content (``apparent_encoding``).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    # Imported here so the parsing/printing helpers stay importable on a
    # machine where the third-party "requests" package is not installed.
    import requests

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def _decode(raw):
    """Decode the inside of a JSON string literal; return it unchanged on failure."""
    # SECURITY: the original code called eval() on this text, which comes
    # straight from a remote page. json.loads decodes the same escapes
    # without executing anything.
    try:
        return json.loads('"%s"' % raw)
    except ValueError:
        return raw


def parsePage(html):
    """Extract the goods listed in one search-result page.

    Args:
        html: Page text as returned by ``getHtmlText``.

    Returns:
        A list of ``[name, price, seller, location]`` lists (all strings),
        one per item. An empty list is returned when nothing matches or
        when *html* is not a string.
    """
    if not isinstance(html, str):
        print("解析出错了")
        return []

    columns = [
        [_decode(raw) for raw in pattern.findall(html)]
        for pattern in _FIELD_PATTERNS
    ]
    # Fields are paired by position. zip() stops at the shortest column, so
    # a page where one field matched fewer times yields fewer rows instead
    # of raising IndexError and discarding the whole page.
    return [list(row) for row in zip(*columns)]


def printGoodsList(goodsList):
    """Print a numbered listing of every item on every page.

    Args:
        goodsList: A list of pages, each page being the list of
            ``[name, price, seller, location]`` rows produced by
            ``parsePage``. Empty or ``None`` pages are skipped.
    """
    print("序号", "商品名称", "价格", "商家", "所在地", chr(12288))
    count = 1
    for page in goodsList:
        for goods in page or ():
            print(count, goods[0], goods[1], goods[2], goods[3])
            count += 1


def main(goodsName="cat男鞋", searchdepth=2):
    """Search for *goodsName*, fetch *searchdepth* pages and print the results.

    Args:
        goodsName: Search keyword.
        searchdepth: Number of result pages to fetch.
    """
    # Build the invariant part once. The original appended the suffix to the
    # same variable on every iteration, so the second and later requests
    # carried a duplicated query string.
    base_url = SEARCH_URL + quote(goodsName) + SEARCH_SUFFIX
    gList = []
    for page in range(searchdepth):
        html = getHtmlText(base_url + str(ITEMS_PER_PAGE * page))
        gList.append(parsePage(html))
    printGoodsList(gList)


if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Top-level boundary: report and show the traceback. Ctrl-C is
        # deliberately not caught here.
        print("出现错误")
        traceback.print_exc()