# 以下代码用于我的个人网站项目(fuckinstall.com),负责解析淘宝搜索结果页面。该网站的主要功能是在后台整合多个搜索引擎的内容,并对结果进行相似度排序和聚类处理。另外还顺带做了一个谷歌镜像页面,不过前端我实在不太擅长。
#coding=utf8
from ..common import crawlerTool as ct
from HTMLParser import HTMLParser#这个出来是unicode的格式,后面没法弄
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import traceback
import json
# 摘取所要数据
def _tag_marketplace(urlinfo):
    # Append the marketplace suffix to the title and record the source,
    # decided by whether the item URL contains the text 'tmall'.
    if 'tmall' in urlinfo['url']:
        urlinfo['title'] = urlinfo['title'] + '-天猫'
        urlinfo['source'] = 'tmall'
    else:
        urlinfo['title'] = urlinfo['title'] + '-淘宝'
        urlinfo['source'] = 'taobao'
    return urlinfo


def _auction_to_urlinfo(segment):
    # Convert one entry of the 'itemlist' auctions list into a result dict.
    urlinfo = {
        'url': 'https://detail.tmall.com/item.htm?id=' + segment['nid'],
        'title': segment['raw_title'],
    }
    # NOTE(review): the URL above is always built on detail.tmall.com, so
    # every auction ends up tagged as tmall; confirm this is intended.
    _tag_marketplace(urlinfo)
    num = segment.get('view_sales', '0')
    price = segment["view_price"]
    urlinfo['info'] = '价格<em>%s</em>元 购买数量<em>%s</em>' % (price, num)
    urlinfo['imglink'] = segment["pic_url"]
    return urlinfo


def _spu_to_urlinfo(segment):
    # Convert one entry of the 'grid' spus list into a result dict.
    urlinfo = {
        'url': segment['url'],
        'title': segment['title'],
    }
    _tag_marketplace(urlinfo)
    important_key = segment['importantKey']
    price = segment["price"]
    urlinfo['info'] = '价格<em>%s</em>元 <em>%s</em> ' % (price, important_key)
    urlinfo['imglink'] = segment["pic_url"]
    return urlinfo


def _collect(segments, convert):
    # Best-effort conversion: an entry that fails to convert is reported via
    # a printed traceback and skipped, so one bad entry cannot drop the rest.
    results = []
    for segment in segments:
        try:
            results.append(convert(segment))
        except Exception:
            traceback.print_exc()
    return results


def process(keyword, page):
    """Fetch one Taobao search page and return the items found in it.

    The page embeds a JSON object assigned to ``g_page_config``. Items are
    read from ``mods.itemlist.data.auctions`` when that list is present and
    non-empty, otherwise from ``mods.grid.data.spus``.

    Args:
        keyword: Search term, interpolated into the query string as-is
            (no URL-encoding is applied here).
        page: Page number; the ``s`` query parameter is set to
            ``(page - 1) * 44``.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``source``,
        ``info`` and ``imglink``. The list is empty when ``g_page_config``
        is missing, is not valid JSON, or contains neither item layout.
    """
    url = 'https://s.taobao.com/search?q=%s&s=%s' % (keyword, (page - 1) * 44)
    html = ct.crawlerTool.getPage(url)
    raw_config = ct.crawlerTool.getRegex(r'g_page_config\s*=\s*(.*);', html)

    # Parse the embedded configuration exactly once. A failed fetch or regex
    # miss (non-string / empty input) and malformed JSON all end up here.
    try:
        mods = json.loads(raw_config)['mods']
    except (KeyError, TypeError, ValueError):
        traceback.print_exc()
        return []

    # Some searches do not carry the item list layout; that is expected, so
    # its absence is silent and the grid layout is tried instead.
    try:
        auctions = mods['itemlist']['data']['auctions']
    except (KeyError, TypeError):
        auctions = []
    if auctions:
        return _collect(auctions, _auction_to_urlinfo)

    try:
        spus = mods['grid']['data']['spus']
    except (KeyError, TypeError):
        traceback.print_exc()
        return []
    return _collect(spus, _spu_to_urlinfo)
def test():
    """Run a sample search for the keyword "python" and return its results.

    Calls ``process`` with the keyword and page number 1; ``process`` takes
    a search keyword (not a full URL) plus a required page argument.
    """
    return process("python", 1)