Request + json: scrape all bilibili anime series:
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
import json
import pymysql

# Get a database connection
conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate", charset="utf8")
# Get a cursor
c = conn.cursor()
base_url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page={}&season_type=1&pagesize=20&type=1"
index = 0
i = 0
while True:
    headers = {
        "User-Agent": UserAgent().chrome
    }
    url = base_url.format(i + 1)
    request = Request(url, headers=headers)
    response = urlopen(request)
    content = response.read().decode()
    try:
        data = json.loads(content)
        items = data['data']['list']
    except Exception:
        print('Exception while parsing the response')
        print(content)
        break
    # An empty list means the last page has been passed
    if not items:
        break
    for li in items:
        # Insert one record; the parameterized query avoids quoting problems in titles
        c.execute(
            "INSERT INTO `bilibili`(`title`, `badge`, `cover`, `index_show`, `is_finish`, `link`, `media_id`, `order`, `season_id`)"
            " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (li['title'], li['badge'], li['cover'], li['index_show'], li['is_finish'],
             li['link'], li['media_id'], li['order'], li['season_id']))
        conn.commit()
        index += 1
        print(f"Scraped series: {li['title']}, total scraped so far: {index}")
    i += 1
c.close()
conn.close()
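The inserts above assume a `bilibili` table already exists. A minimal sketch of a schema that fits those columns (the column types are assumptions, not taken from the original database):
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate", charset="utf8")
with conn.cursor() as c:
    # Assumed column types; adjust to the real schema if it differs
    c.execute("""
        CREATE TABLE IF NOT EXISTS `bilibili` (
            `id`         INT AUTO_INCREMENT PRIMARY KEY,
            `title`      VARCHAR(255),
            `badge`      VARCHAR(64),
            `cover`      VARCHAR(512),
            `index_show` VARCHAR(64),
            `is_finish`  VARCHAR(8),
            `link`       VARCHAR(512),
            `media_id`   VARCHAR(32),
            `order`      VARCHAR(64),
            `season_id`  VARCHAR(32)
        ) CHARACTER SET utf8
    """)
conn.commit()
conn.close()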
PyQuery: scrape a proxy-list page
from pyquery import PyQuery as pq
import requests
from fake_useragent import UserAgent
import pymysql

# Get a database connection
conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate",
                       charset="utf8")
# Get a cursor
c = conn.cursor()
url = "http://www.xicidaili.com/nn/{}"
headers = {
    "User-Agent": UserAgent().random
}
index = 0
index1 = 0
# Optional proxies that could be passed as requests.get(..., proxies=proxies)
proxies = {
    'https': '123.149.137.221:9999',
    'http': '123.149.137.85:9999'
}
while True:
    response = requests.get(url.format(index + 1), headers=headers)
    doc = pq(response.text)
    table = doc('#ip_list tr')
    # The first row is the table header; nothing below it means the last page has been passed
    if len(table) <= 1:
        break
    for ta in range(1, len(table)):
        td = table.eq(ta).find('td')
        # IP address
        ip = td.eq(1).text()
        # Port
        prot = td.eq(2).text()
        # Server location
        address = td.eq(3).find('a').text()
        # Anonymity level
        nim = td.eq(4).text()
        # Protocol (http / https)
        http = td.eq(5).text()
        # Speed
        survive = td.eq(6).children().attr('title')
        # Connect time
        time = td.eq(7).children().attr('title')
        # Alive time
        runtime = td.eq(8).text()
        # Last verified
        verification = td.eq(9).text()
        c.execute(
            "INSERT INTO `proxy`(`ip`, `prot`, `address`, `nim`, `http`, `survive`, `time`, `runtimes`)"
            " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
            (ip, prot, address, nim, http, survive, time, runtime))
        conn.commit()
        index1 += 1
        print(f"Records scraped so far: {index1}")
    index += 1
c.close()
conn.close()
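The proxies collected above are stored without being checked. A minimal sketch of an availability check, assuming httpbin.org as a throwaway test target and a 5-second timeout:
import requests
from fake_useragent import UserAgent

def proxy_alive(ip, port, scheme="http", timeout=5):
    # Route a cheap request through the proxy and treat any error or non-200 as "dead"
    proxies = {"http": f"{scheme}://{ip}:{port}", "https": f"{scheme}://{ip}:{port}"}
    try:
        r = requests.get("http://httpbin.org/ip", headers={"User-Agent": UserAgent().random},
                         proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

print(proxy_alive("123.149.137.85", "9999"))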
xpath: scrape novels from Qidian
from lxml import etree
import requests
from fake_useragent import UserAgent
import pymysql

# Get a database connection
conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate",
                       charset="utf8")
# Get a cursor
c = conn.cursor()
index = 0
i2 = 0
urls = "https://www.qidian.com/rank/yuepiao?chn=21&page={}"
headers = {
    "User-Agent": UserAgent().chrome
}
while True:
    url = urls.format(i2 + 1)
    response = requests.get(url, headers=headers)
    e = etree.HTML(response.text)
    names = e.xpath('//h4/a/text()')
    authors = e.xpath('//h4/a/@href')
    texts = e.xpath('//p[@class="intro"]/text()')
    user = e.xpath('//p[@class="author"]/a[1]/text()')
    # No titles on the page means the last page has been passed
    if not names:
        break
    for name, href, text, author in zip(names, authors, texts, user):
        # The hrefs on the page may be protocol-relative ("//book.qidian.com/...")
        href = "https:" + href if href.startswith("//") else "https://" + href
        # strip() removes the whitespace around the blurb
        jj = text.strip()
        c.execute(
            "INSERT INTO `qidian`(`title`, `href`, `remark`, `author`) VALUES (%s, %s, %s, %s)",
            (name, href, jj, author))
        conn.commit()
        index += 1
        print(f"Scraped novel: {name}, total scraped so far: {index}")
    i2 += 1
c.close()
conn.close()
# print(user)
# for name, author,text in zip(names, authors,texts):
# str1=str(text).replace(' ','')
# print(name, ":", "https://"+author,":",str1)
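Committing after every row is slow. A minimal sketch of batching the Qidian inserts with executemany (the rows list here is illustrative, not real data):
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate", charset="utf8")
c = conn.cursor()
# Illustrative rows, in the same (title, href, remark, author) order as the INSERT above
rows = [
    ("Example Novel", "https://book.qidian.com/info/0", "Example blurb", "Example Author"),
]
c.executemany(
    "INSERT INTO `qidian`(`title`, `href`, `remark`, `author`) VALUES (%s, %s, %s, %s)",
    rows)
conn.commit()
c.close()
conn.close()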
xpath: scrape a certain xx site
from lxml import etree
import requests
from fake_useragent import UserAgent

url = "http://xxxxx.top/"
response = requests.get(url, headers={"User-Agent": UserAgent().chrome})
e = etree.HTML(response.text)
menu = e.xpath('.//div[@class="wrap mt10 nav"]/ul[@class="nav_menu clearfix"]/li/a/@href')
menu2 = e.xpath('.//div[@class="wrap mt10 nav"]/ul[@class="nav_menu clearfix"]/li/a/text()')
file = open('file.txt', 'w', encoding='utf-8')
# Walk the menu entries
for m in range(len(menu)):
    st = str(menu[m])
    st2 = str(menu2[m])
    if st.endswith('html'):
        url2 = url + st
        url2 = url2[:len(url2) - 5] + '-pg-{}.html'
        # Current page number
        index = 0
        # Maximum page number
        maxindex = 0
        while True:
            response = requests.get(url2.format(index + 1), headers={"User-Agent": UserAgent().chrome})
            index += 1
            page = etree.HTML(response.text)
            # Stop once the maximum page has been reached
            if index == maxindex and maxindex > 1:
                break
            try:
                # The "尾页" (last page) link carries the maximum page number, e.g. "...-pg-12.html"
                last = str(page.xpath('.//a[contains(text(),"尾页")]/@href')[0])
                maxindex = int(last[len(last) - 7:len(last) - 5])
            except Exception:
                break
            href = page.xpath('.//div[contains(@class,"movie_list")]/ul/li/a/@href')
            title = page.xpath('.//li/a/@title')
            image = page.xpath('.//li/a/img/@src')
            for i in range(len(href)):
                # Detail-page link
                hrefs = href[i]
                # Title
                titles = title[i]
                # Cover image link
                images = image[i]
                file.write(f"Category: {st2}, link: {url + hrefs}, title: {titles}, cover: {images}")
                file.write('\n')
                file.flush()
file.close()
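The two-character slice used above to read the maximum page number breaks once the site has more than 99 pages. A sketch of a regex alternative, assuming the pagination links keep the "-pg-N.html" form:
import re

def max_page(last_page_href):
    # Assumes links such as ".../list-pg-12.html"; returns 0 when the pattern is absent
    match = re.search(r'-pg-(\d+)\.html', last_page_href)
    return int(match.group(1)) if match else 0

print(max_page('/list-pg-12.html'))   # 12
print(max_page('/list-pg-108.html'))  # 108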
xpath: scrape an image-gallery page and download the pictures
import os
import requests
from fake_useragent import UserAgent
from lxml import etree

url = "https://tuchong.com/1485770/19399344/#image351010920"
response = requests.get(url, headers={"User-Agent": UserAgent().chrome})
e = etree.HTML(response.text)
img_urls = e.xpath('//article/img/@src')
print(img_urls)
# Make sure the output directory exists before writing into it
os.makedirs('img', exist_ok=True)
for url in img_urls:
    response = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # Use the last path segment as the file name
    img_name = url[url.rfind('/') + 1:]
    with open('img/' + img_name, 'wb') as f:
        f.write(response.content)
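response.content loads each image fully into memory. A minimal sketch of a streamed download for larger files (the URL below is a placeholder):
import requests
from fake_useragent import UserAgent

def download(url, path, chunk_size=8192):
    # stream=True keeps the body out of memory until iter_content reads it in chunks
    with requests.get(url, headers={"User-Agent": UserAgent().chrome}, stream=True) as response:
        with open(path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)

download("https://example.com/sample.jpg", "img/sample.jpg")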
selenium: scrape Tmall face-mask product data
from selenium import webdriver
from lxml import etree
from time import sleep

url = 'https://list.tmall.com/search_product.htm?q=%BF%DA%D5%D6&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'
chrome = webdriver.Chrome()
chrome.get(url)
while True:
    sleep(2)
    html = chrome.page_source
    e = etree.HTML(html)
    names = e.xpath('//p[@class="productTitle"]/a/@title')
    prices = e.xpath('//p[@class="productPrice"]/em/@title')
    # Scroll down so lazily loaded items are rendered
    js = 'document.documentElement.scrollTop=10000'
    chrome.execute_script(js)
    for name, price in zip(names, prices):
        print(f"Name: {name}, price: {price}")
    # Go to the next page, or stop when there is no next-page button
    next_buttons = chrome.find_elements_by_class_name('ui-page-next')
    if next_buttons:
        next_buttons[0].click()
    else:
        break
chrome.quit()
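The fixed sleep(2) above either wastes time or fires before the listing has rendered. A minimal sketch of waiting for the product titles explicitly (the shortened URL and the 10-second timeout are assumptions):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome = webdriver.Chrome()
chrome.get('https://list.tmall.com/search_product.htm?q=%BF%DA%D5%D6')
# Block until at least one product title is present, or raise after 10 seconds
WebDriverWait(chrome, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'productTitle')))
print(len(chrome.find_elements_by_class_name('productTitle')))
chrome.quit()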
selenium: scrape all streamers on Huya Live
from selenium import webdriver
from time import sleep

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
url = 'https://www.huya.com/g/lol'
driver.get(url)
num = 1
while True:
    print('Page ' + str(num) + "----------------------------------------------")
    num += 1
    sleep(5)
    names = driver.find_elements_by_xpath('//i[@class="nick"]')
    counts = driver.find_elements_by_xpath('//i[@class="js-num"]')
    titles = driver.find_elements_by_xpath('//a[contains(@class,"title new-clickstat")]')
    for name, count, title in zip(names, counts, titles):
        print(name.text, ":", count.text, ":", title.text)
    # Click "next page" until the button disappears
    if driver.page_source.find('laypage_next') != -1:
        driver.find_element_by_xpath('//a[@class="laypage_next"]').click()
    else:
        break
driver.quit()
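The Huya results above are only printed. A minimal sketch of writing them to a CSV file instead (the file name, column headers, and sample row are assumptions):
import csv

# rows would be filled inside the scraping loop as (nick, viewer_count, title) tuples
rows = [("ExampleStreamer", "12.3万", "Example stream title")]
with open('huya_lol.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['nick', 'viewers', 'title'])
    writer.writerows(rows)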
selenium: scrape articles from Jianshu
from selenium import webdriver
from time import sleep

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
url = 'https://www.jianshu.com/'
driver.get(url)
index = 10000
js = 'document.documentElement.scrollTop={}'
for i in range(10):
    # Scroll down twice so more articles are lazily loaded
    driver.execute_script(js.format(index))
    sleep(1)
    index += 10000
    driver.execute_script(js.format(index))
    index += 10000
    sleep(1)
    # The "load more" button only appears after a few scrolls, so ignore it when absent
    try:
        driver.find_element_by_xpath('.//a[@class="load-more"]').click()
    except Exception:
        pass
title = driver.find_elements_by_xpath('//a[@class="title"]')
text = driver.find_elements_by_xpath('//p[@class="abstract"]')
user = driver.find_elements_by_xpath('//a[@class="nickname"]')
for titles, texts, users in zip(title, text, user):
    print(f"Title: {titles.text}, abstract: {texts.text.strip()}, author: {users.text.strip()}")
driver.quit()