Database (MongoDB) storage:
Code:
# urlencode builds a query string from a dict of parameters so it can be appended to the URL
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from pymongo import MongoClient
import requests

# MongoDB connection
client = MongoClient()
# Select (or create) the database
db = client['weibo']
# Select (or create) the weibo collection
collection = db['weibo']

# Base of the real request URL
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2145291155',
    'User-Agent': 'Mozilla/5.0',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(page):
    params = {
        'type': 'uid',
        'value': '2145291155',
        'containerid': '1076032145291155',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)

# Parser: extract the fields we want from the JSON response
def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo

def save_to_mongo(result):
    # insert_one() replaces the deprecated insert() in current PyMongo versions
    if collection.insert_one(result):
        print('Saved to Mongo')

if __name__ == '__main__':
    for page in range(1, 17):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            print(result)
            save_to_mongo(result)
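As a quick sanity check, the saved records can be read back out of MongoDB. This is a minimal sketch that assumes the same weibo database and collection names used above:

from pymongo import MongoClient

client = MongoClient()
collection = client['weibo']['weibo']

# Count the stored posts and show the three with the most likes
print('Total posts:', collection.count_documents({}))
for doc in collection.find().sort('attitudes', -1).limit(3):
    print(doc['attitudes'], doc['text'])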
Result screenshot:
CSV storage:
Code:
# urlencode builds a query string from a dict of parameters so it can be appended to the URL
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import requests
import csv

# Base of the real request URL
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    # The server uses the Host header to decide which site this request is for
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2145291155',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(page):
    params = {
        'type': 'uid',
        'value': '2145291155',
        'containerid': '1076032145291155',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)

# Parser: extract the fields we want from the JSON response
def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo

if __name__ == '__main__':
    for page in range(1, 18):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            with open('data.csv', 'a', newline='', encoding='utf-8-sig') as f:
                fieldnames = ['id', 'text', 'attitudes', 'comments', 'reposts']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                # Write the header row only once, while the file is still empty
                if f.tell() == 0:
                    writer.writeheader()
                # result already uses exactly these keys, so it can be written directly
                writer.writerow(result)
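To verify the export, the rows can be read back with csv.DictReader. This is a minimal sketch that assumes data.csv was written with a header row as in the listing above:

import csv

# Read the CSV back and print each post's id and like count
with open('data.csv', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row['id'], row['attitudes'])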
Result screenshot:
TXT storage:
Code:
# urlencode builds a query string from a dict of parameters so it can be appended to the URL
from urllib.parse import urlencode
# HTML parsing library
from pyquery import PyQuery as pq
import requests

# Base of the real request URL
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    # The server uses the Host header to decide which site this request is for
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2145291155',
    # Browser User-Agent so the request looks like it comes from a normal browser
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    # Marks the request as an Ajax (XHR) request
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(page):
    # Query parameters for the dynamically loaded page
    params = {
        'type': 'uid',
        'value': '2145291155',
        'containerid': '1076032145291155',
        'page': page
    }
    # The real request URL
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Return the response parsed as JSON
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)

# Parser: extract the fields we want from the JSON response
def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = str(item.get('id'))
            # pyquery's .text() strips the HTML tags, leaving only the plain text
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = str(item.get('attitudes_count'))
            weibo['comments'] = str(item.get('comments_count'))
            weibo['reposts'] = str(item.get('reposts_count'))
            yield weibo

if __name__ == '__main__':
    for page in range(1, 18):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            with open('weibo.txt', 'a', encoding='utf-8-sig') as f:
                # str.join() accepts only strings, which is why the counts were converted above
                f.write('\n'.join([result.get('id'), result.get('text'), result.get('attitudes'), result.get('comments'), result.get('reposts')]))
                f.write('\n' + '=' * 60 + '\n')
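To read the records back, the file can be split on the separator line. This is a minimal sketch that assumes the weibo.txt file and the '=' separator written above:

# Read weibo.txt back and split it into individual records on the separator line
with open('weibo.txt', encoding='utf-8-sig') as f:
    records = [block.strip() for block in f.read().split('=' * 60) if block.strip()]
print('Total records:', len(records))
if records:
    print(records[0])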
Result screenshot: