#coding:utf-8
import requests
from lxml import etree
import os
import sys
import urllib
# Python 2 only: reload() restores sys.setdefaultencoding (deleted by site.py
# at startup); forcing utf-8 is a legacy hack for mixed str/unicode handling.
reload(sys)
sys.setdefaultencoding('utf-8')
# Directory this script lives in, decoded from the 'gbk' filesystem encoding
# (assumes a zh-CN Windows box — TODO confirm); thread folders are created here.
path = os.path.dirname(__file__).decode('gbk')
# Forum base URL; thread hrefs scraped from listing pages are relative to it.
sys_url = "http://bbs.seller.aliexpress.com/bbs/"
# html_text=req.content
# html=etree.HTML(html_text)
# titles=html.xpath(".//*[@id='threadlist']/tr/td[2]/a[last()]/text()") #文本标题
# filenames=html.xpath(".//*[@id='threadlist']/tr/td[2]/a[last()]/@id") #文件夹名称
# hrefs=html.xpath(".//*[@id='threadlist']/tr/td[2]/a[last()]/@href") #具体内容链接
# for title in titles:
# "用标题做文件夹名称"
# folder=os.path.join(path, filenames[titles.index(title)]) #文件夹
# if not os.path.exists(folder):
# os.makedirs(folder)
# "获取页面详情 存储文字和图片"
# html_page=requests.get(sys_url + hrefs[titles.index(title)])
# html_text=etree.HTML(html_page.content)
# "存储html"
# html_name= hrefs[titles.index(title)].split('=')[1] + '.html' #html 文件名
# print 'html_name',html_name
# if not os.path.exists(os.path.join(folder,html_name)):
# with open(os.path.join(folder, html_name), 'wb') as file:
# file.write(html_page.content)
# def download_pic(hrefs,path):
# """
# :param path: 图片的获取路径 list
# :return: 图片存储于本地
# """
# html_page = requests.get(sys_url + hrefs[titles.index(title)])
# html_text = etree.HTML(html_page.content)
# "提取图片,放到文件夹"
# imgs_url=html_text.xpath(path)
# print imgs_url
# for img_url in imgs_url:
# img_path=folder+'\\'+img_url.split('/')[-1] #图片存放
# if not os.path.exists(img_path):
# urllib.urlretrieve(img_url, folder + '\\' + img_url.split('/')[-1])
# return True
# def download_html(hrefs,split_key,num=1):
# "存储html"
# html_name= hrefs[titles.index(title)].split(split_key)[num] + '.html' #html 文件名
# print 'html_name',html_name
# if not os.path.exists(os.path.join(folder,html_name)):
# with open(os.path.join(folder, html_name), 'wb') as file:
# file.write(html_page.content)
def analysis_html(url,title_xpath,folder_xpath,href_xpath):
    """Fetch *url* and extract thread metadata from the listing page.

    Returns a 3-tuple of lists: (titles, folders, hrefs), each the result
    of evaluating the corresponding XPath expression against the page.
    """
    response = requests.get(url)
    tree = etree.HTML(response.content)
    return (tree.xpath(title_xpath),
            tree.xpath(folder_xpath),
            tree.xpath(href_xpath))
def download(titles,folders,hrefs):
"下载需要信息"
for title in titles:
"用标题做文件夹名称"
folder = os.path.join(path, folders[titles.index(title)]) # 文件夹
if not os.path.exists(folder):
os.makedirs(folder)
"获取页面详情 存储文字和图片"
html_page = requests.get(sys_url + hrefs[titles.index(title)])
html_text = etree.HTML(html_page.content)
"存储html"
html_name = hrefs[titles.index(title)].split('=')[1] + '.html' # html 文件名
print 'html_name', html_name
if not os.path.exists(os.path.join(folder, html_name)):
with open(os.path.join(folder, html_name), 'wb') as file:
file.write(html_page.content)
html_page = requests.get(sys_url + hrefs[titles.index(title)])
html_text = etree.HTML(html_page.content)
"提取图片,放到文件夹"
imgs_url = html_text.xpath(path)
print imgs_url
for img_url in imgs_url:
img_path = folder + '\\' + img_url.split('/')[-1] # 图片存放
if not os.path.exists(img_path):
urllib.urlretrieve(img_url, folder + '\\' + img_url.split('/')[-1])
# requests lxml 实战 1  (tutorial title — residue from the scraped blog page)
# 猜你喜欢
# 转载自 blog.csdn.net/zhouxuan623/article/details/82805380
# 今日推荐
# 周排行