This post gives four approaches: some are bare-bones versions that only just meet the requirement, others are more complete.
1. Single page, extracting fields with a regular expression
# -*- coding:utf-8 -*-
import re
import urllib2
import json
import sys

if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')


class JDSpider:
    def loadPage(self):
        url = "https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv566&productId=100001906474&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1"
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read()
        # pull (content, nickname) pairs straight out of the JSONP text
        pattern = re.compile('"content":"(.*?)".*?"nickname":"(.*?)"')
        item_list = pattern.findall(html)
        print item_list
        for item in item_list:
            self.writetoFile(item)

    # def printPage(self, item_list, page):
    #     print item_list
    #     print "==== crawling page %d ====" % page
    #     for item in item_list:
    #         self.writetoFile(item)

    def writetoFile(self, test):
        # append one (content, nickname) tuple to the output file
        with open("d:/124/jindong1.txt", 'a+') as myFile:
            json.dump(test, myFile, ensure_ascii=False, encoding='utf-8')
            myFile.write("\n-------------------------------------\n")

    # Unused multi-page variant: keeps fetching until the user types quit.
    # def doWork(self):
    #     while self.enable:
    #         try:
    #             item_list = self.loadPage(self.page)
    #         except urllib2.URLError, e:
    #             print e.reason
    #             continue
    #         self.printPage(item_list, self.page)
    #         self.page += 1
    #         print "Press Enter to continue"
    #         print "Type quit to exit"
    #         command = raw_input()
    #         if command == "quit":
    #             break


if __name__ == '__main__':
    """
    ======================
    JD comment spider
    ======================
    """
    # create a JDSpider instance and fetch one page of comments
    mySpider = JDSpider()
    mySpider.loadPage()
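This version never parses the response as JSON: it relies on each comment object carrying a "content" field followed later by a "nickname" field, and lets the non-greedy regex pair them up. Below is a minimal offline sketch of that idea against a hand-written sample string (the sample text is invented for illustration; a live response is much larger):

# -*- coding:utf-8 -*-
import re

# Hand-written fragment resembling part of the JSONP body (illustrative only).
sample = ('{"content":"Screen is great, battery lasts a day",'
          '"creationTime":"2019-02-01 10:00:00","nickname":"user_a"},'
          '{"content":"Fast delivery","creationTime":"2019-02-02 11:30:00","nickname":"user_b"}')

# Same non-greedy pattern as JDSpider.loadPage: pair each content with the next nickname.
pattern = re.compile('"content":"(.*?)".*?"nickname":"(.*?)"')
for content, nickname in pattern.findall(sample):
    print content, '--', nickname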
2. Multiple pages, stripping the JSONP wrapper and parsing JSON (class-based)
# -*- coding:utf-8 -*-
import urllib2
import json
import sys


class Comment:
    def Commets(self):
        reload(sys)
        sys.setdefaultencoding('utf-8')
        f = open('02.txt', 'w')
        for i in range(0, 10):
            # crawl multiple pages by varying the page parameter
            url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv562&productId=100001906474&score=0&sortType=5&page=' + str(
                i) + '&pageSize=10&isShadowSku=0&fold=1'
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            html = response.read().decode('GBK')
            # strip the JSONP wrapper so the body parses as plain JSON
            html = html.replace('fetchJSON_comment98vv562(', '')
            html = html.replace(');', '')
            b = json.loads(html)
            print b
            for k in b['comments']:
                content = k["content"].encode('utf-8')
                self.writeToFile(content)
                f.write(k["content"].encode('utf-8') + '\n')
                referenceName = k["nickname"].encode('utf-8')
                self.writeToFile(referenceName)
                f.write(k["nickname"].encode('utf-8') + '\n')
                referenceTime = k["referenceTime"].encode('utf-8') + '\n\n'
                self.writeToFile(referenceTime)
        f.close()

    def writeToFile(self, text):
        # @brief append a piece of text to the output file
        # @param text  the text to write
        with open("d:/124/jd.txt", 'a') as myFile:
            myFile.write(text)
            myFile.write("\n-----------------------------------------------")


if __name__ == '__main__':
    comment = Comment()
    comment.Commets()
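Note that the JSONP callback name is not fixed: approach 1 received fetchJSON_comment98vv566 while this one gets fetchJSON_comment98vv562, so stripping the wrapper with a hard-coded replace() breaks as soon as the server hands back a different name. Here is a small sketch of a more tolerant cleanup, assuming only that the body has the shape callbackName({...}); — the sample string below is illustrative:

# -*- coding:utf-8 -*-
import re
import json

def strip_jsonp(text):
    # Drop a leading "callbackName(" and a trailing ");" regardless of the callback's exact name.
    return re.sub(r'^\s*\w+\(|\);?\s*$', '', text)

# Illustrative wrapper only; a real response carries the full comment payload.
raw = 'fetchJSON_comment98vv562({"comments": [{"content": "ok", "nickname": "user_a"}]});'
data = json.loads(strip_jsonp(raw))
for k in data['comments']:
    print k['content'], k['nickname']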
3. The same JSON-parsing approach as a plain script
# -*- coding: utf-8 -*-
import urllib2
import json
import sys

reload(sys)
sys.setdefaultencoding('utf8')

f = open('01.txt', 'w')
for i in range(0, 10):
    # crawl multiple pages by varying the page parameter
    url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv562&productId=100001906474&score=0&sortType=5&page=' + str(i) + '&pageSize=10&isShadowSku=0&fold=1'
    print url
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read().decode('GBK')
    # strip the JSONP wrapper so the body parses as plain JSON
    html = html.replace('fetchJSON_comment98vv562(', '')
    html = html.replace(');', '')
    b = json.loads(html)
    for k in b['comments']:
        content = k["content"].encode('utf-8')
        print content
        f.write(k["content"].encode('utf-8') + '\n')
        referenceName = k["referenceName"].encode('utf-8')
        print referenceName
        f.write(k["referenceName"].encode('utf-8') + '\n')
        referenceTime = k["referenceTime"].encode('utf-8')
        print referenceTime
        f.write(k["referenceTime"].encode('utf-8') + '\n\n')
f.close()
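Splicing str(i) into the middle of a long, hand-written query string is easy to get wrong. A sketch of building the same URLs from a parameter dict with urllib.urlencode, so only the page value changes per iteration (parameter names copied from the URLs above; the resulting parameter order may differ, which the endpoint should accept):

# -*- coding:utf-8 -*-
import urllib

base = 'https://sclub.jd.com/comment/productPageComments.action'
for i in range(0, 10):
    # Only "page" varies per iteration; everything else matches the URLs used above.
    params = {
        'callback': 'fetchJSON_comment98vv562',
        'productId': '100001906474',
        'score': 0,
        'sortType': 5,
        'page': i,
        'pageSize': 10,
        'isShadowSku': 0,
        'fold': 1,
    }
    url = base + '?' + urllib.urlencode(params)
    print url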