# 版权声明:本文为博主原创文章,如果转走了就评论说一声就好了哈。 https://blog.csdn.net/qq_36124802/article/details/80446684
#coding=utf-8
import json
from _md5 import md5
from multiprocessing.pool import Pool
import re
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import time
import pymongo as pm
import random
#get_info获取大类url
#channel_extract获取物品url
#page获取物品详细信息
#长数据用三引号去做
# Browser-like headers so 58.com serves the normal page to the scraper.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# NOTE(review): `proxies` is built here but never passed to requests.get()
# below, so the proxy list is effectively unused in this script — confirm.
proxy_list = [
    'http://39.137.77.68:8080',
    'http://221.130.253.135:8090',
]
proxy_ip = random.choice(proxy_list)
proxies = {'http':proxy_ip}
client = pm.MongoClient(host='localhost', port=27017)
db = client.page_58
collection1 = db.url_find   # item URLs discovered by the list crawler
collection2 = db.url_found  # item URLs whose detail page was already scraped
# Load every previously stored URL from both collections.
url_find = collection1.find()
url_found = collection2.find()
db_urls = []
index_urls = []
for item in url_find:
    db_urls.append(item['url'])
for item in url_found:
    index_urls.append(item['url'])
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x-y  # set difference: discovered but not yet scraped URLs
# for item in rest_of_urls:
#     print(item)
url = 'http://xa.58.com//shouji/'  # sample category URL (not used below)
def get_item_info(url):
    """Fetch a 58.com listing detail page, parse it, and store the result.

    Parses price / title / tags / place / content / view-count out of the
    detail page, inserts the record into ``db.commodity`` and records the
    URL in ``db.url_found`` so the same page is not crawled again.

    Returns the parsed info dict, or None when the request or parse failed
    (errors are printed, not raised, so a Pool worker keeps running).
    """
    global db
    collection1 = db.commodity
    collection2 = db.url_found
    try:
        wb_data = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        prices = soup.select('span.price_now')
        titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
        types = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        places = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
        contents = soup.select('body > div.content > div > div.box_left > div > div > div > p')
        looks = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')
        # Fall back to '未知' ("unknown") for any field missing from the page.
        price = prices[0].get_text() if prices else '未知'
        title = titles[0].get_text() if titles else '未知'
        content = contents[0].get_text() if contents else '未知'
        look = looks[0].get_text() if looks else '未知'
        place = places[0].get_text() if places else '未知'
        # BUG FIX: the original indexed types[0] unconditionally, raising
        # IndexError on pages without a tag box; also avoid shadowing the
        # builtin `type` with the tag list.
        tag_list = list(types[0].stripped_strings) if types else []
        info = {
            'price': price,
            'title': title,
            'type': tag_list,
            'place': place,   # BUG FIX: was parsed above but never stored
            'content': content,
            'look': look,
        }
        data = {
            'url': url
        }
        # Collection.insert() was removed in PyMongo 4; use insert_one().
        collection1.insert_one(info)
        collection2.insert_one(data)
        print(info)
        return info
    except Exception as e:
        # Best-effort crawler: log and continue so the Pool keeps working.
        print(e)
# Here every page's data is extracted.
# Each URL can be crawled repeatedly, tracking page number and category.
# The stored records can later be pulled from the database and rendered
# as tables/visualisations for browsing.
def get_all_links_from(channel):
    # Pool-worker entry point: scrape and store a single detail-page URL.
    get_item_info(channel)
if __name__ == '__main__':  # guard required by multiprocessing on import
    pool = Pool()  # one worker process per CPU core by default
    # map() feeds each not-yet-scraped URL (rest_of_urls, computed above)
    # to a worker; the argument must be an iterable.
    pool.map(get_all_links_from,rest_of_urls)
    # A long triple-quoted string of URLs could instead be split into a list:
    # pool.map(get_all_links_from,list.split())
#coding=utf-8
import json
from _md5 import md5
from multiprocessing.pool import Pool
import re
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import pymongo as pm
import json
# Browser-like headers so 58.com serves the normal page to the scraper.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
url = 'http://xa.58.com/'       # front page to scrape for category links
url_host = 'http://xa.58.com/'  # prefix to absolutize the relative hrefs
def get_urls(url):
    """Scrape the 58.com front page for category links.

    Returns two lists of absolute category URLs: the links found under the
    ``span > a`` selector group and those under the plain ``a`` group.
    Only URLs whose absolute form contains exactly five '/' characters are
    kept (these are the top-level category pages).
    """
    res = requests.get(url, headers=HEADERS)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')

    def _category_links(selector):
        # Absolutize every href under `selector` and keep category-level ones.
        # (The original had this loop copy-pasted twice and kept an unused
        # `tot` counter; both removed.)
        links = []
        for tag in soup.select(selector):
            full_url = url_host + tag.get('href')
            if full_url.count('/') == 5:
                links.append(full_url)
        return links

    url_list1 = _category_links('body > div.article > div.mainWrap > div.leftSide > div > div.fl.cbp2.cbhg > div > span > a')
    url_list2 = _category_links('body > div.article > div.mainWrap > div.leftSide > div > div.fl.cbp2.cbhg > div > a')
    return url_list1, url_list2
# Fetch the two groups of category URLs and persist them, tagged by origin.
url_list1, url_list2 = get_urls(url)
client = pm.MongoClient(host='localhost', port=27017)
db = client.page_58
collection = db.url
for item in url_list1:
    # type '1' / '2' records which selector group the link came from.
    # Collection.insert() was removed in PyMongo 4; use insert_one().
    # (Also: the original rebound the module-level `url` here — avoided.)
    collection.insert_one({
        'type': '1',
        'url': item
    })
for item in url_list2:
    collection.insert_one({
        'type': '2',
        'url': item
    })
# Selector notes: change dd:nth-child(1) to dd:nth-of-type(1) to pick the
# first element; drop the :nth-of-type(1) suffix to select all of them.
# zip() merges several iterators into one (truncates to the shortest).
#coding=utf-8
import json
from _md5 import md5
from multiprocessing.pool import Pool
import re
import os
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import time
import pymongo as pm
import random
# Long literals can use triple-quoted strings.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# NOTE(review): `proxies` is built but never passed to requests.get() below;
# the proxy list is effectively unused in this script — confirm.
proxy_list = [
    'http://39.137.77.68:8080',
    'http://221.130.253.135:8090',
]
proxy_ip = random.choice(proxy_list)
proxies = {'http':proxy_ip}
client = pm.MongoClient(host='localhost', port=27017)
db = client.page_58
collection = db.url
# Reload the category URLs stored earlier by the front-page scraper.
url_list1 = collection.find({'type': '1'})
url_list2 = collection.find({'type': '2'})
url = 'http://xa.58.com//shouji/'  # sample category URL (not used below)
client2 = pm.MongoClient(host='localhost', port=27017)
db2 = client2.page_58
collection2 = db2.commodity  # parsed item records
collection3 = db.url_find    # item URLs queued for detail scraping
collection4 = db.url_found   # item URLs already scraped
def get_item_info(url):
    """Fetch a 58.com listing detail page and parse its fields.

    Returns a dict with price / title / type (tag list) / place / content /
    look ('未知', i.e. "unknown", for any field missing from the page), or
    None when the request or parse failed (errors are printed, not raised).
    """
    try:
        wb_data = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        prices = soup.select('span.price_now')
        titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
        types = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        places = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
        contents = soup.select('body > div.content > div > div.box_left > div > div > div > p')
        looks = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')
        # Fall back to '未知' ("unknown") for any field missing from the page.
        price = prices[0].get_text() if prices else '未知'
        title = titles[0].get_text() if titles else '未知'
        content = contents[0].get_text() if contents else '未知'
        look = looks[0].get_text() if looks else '未知'
        place = places[0].get_text() if places else '未知'
        # BUG FIX: the original indexed types[0] unconditionally, raising
        # IndexError on pages without a tag box; also avoid shadowing the
        # builtin `type` with the tag list.
        tag_list = list(types[0].stripped_strings) if types else []
        info = {
            'price': price,
            'title': title,
            'type': tag_list,
            'place': place,   # BUG FIX: was parsed above but never returned
            'content': content,
            'look': look,
        }
        return info
    except Exception as e:
        # Best-effort crawler: log and continue.
        print(e)
# Feature 1: stored data is de-duplicated later by diffing url_find
# against url_found. `type` records which kind of data is being crawled.
def get_links_form(channel, pages, type, who_sells=0):
    """Scrape one result page of a category and queue every item URL.

    channel   -- category base URL (ends with '/')
    pages     -- result-page number to fetch
    type      -- caller's category tag (kept for interface compatibility;
                 not used inside the function)
    who_sells -- 58.com path segment selecting the seller kind (default 0)

    Item URLs are inserted into db.url_find; errors are printed, not
    raised, so a Pool worker keeps running.
    """
    global collection2, collection3
    try:
        # Build the real list URL, e.g. http://xa.58.com/shouji/0/pn3
        list_view = '{}{}/pn{}'.format(channel, str(who_sells), str(pages))
        wb_data = requests.get(list_view, headers=HEADERS)
        # time.sleep(8)  # without a pause the anti-crawler may refuse access
        soup = BeautifulSoup(wb_data.text, 'lxml')
        res = soup.select('#infolist > div > table > tbody > tr > td.img > a')
        if soup.find('td', 't'):  # marker tag present only while results remain
            for item in res:
                href = item.get('href').split('?')[0]  # strip tracking query
                # Skip redirect/ad links (char after 'http://' is 'j', e.g.
                # jump.*). BUG FIX: length guard avoids IndexError on short hrefs.
                if len(href) > 7 and href[7] == 'j':
                    continue
                # Collection.insert() was removed in PyMongo 4; use insert_one().
                # Queue the URL in url_find for the detail-page crawler.
                collection3.insert_one({
                    'url': href
                })
        else:
            pass  # past the last page: nothing to store, keep the worker alive
    except Exception as e:
        print(e)
#这里上把所有页数的数据全部提取
# #每个url爬多遍,然后其上第几页,类型为什么
#
# #然后可以通过从数据库中拿到数据,然后做成可视化表格去供我们浏览
#
def get_all_links_from(channel):
    """Pool-worker entry point: walk result pages 1-99 of one category."""
    page_no = 1
    while page_no < 100:
        get_links_form(channel, page_no, 1)
        page_no += 1
if __name__ == '__main__':  # guard required by multiprocessing on import
    pool = Pool()  # one worker process per CPU core by default
    # Materialize the Mongo cursor into a plain list of URL strings,
    # since map() needs an iterable argument per worker call.
    urls = []
    for url in url_list1:
        urls.append(url['url'])
    pool.map(get_all_links_from,urls)
    # A long triple-quoted string of URLs could instead be split into a list:
    # pool.map(get_all_links_from,list.split())