用 selenium 玩的是精准打击。
这是一个爬取指定汉字的笔顺动画、拼音和读音的小爬虫。速度慢,但是指向很灵活:只需要调整 yourtxt.txt
里面的内容即可。
#coding:utf-8
import os
import re
from time import sleep
from urllib.request import urlretrieve

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
# Start page of the dictionary site; every lookup begins and ends here.
address = 'https://hanyu.baidu.com/'
# Module-level placeholders kept from the original script (getvalue() uses
# its own parameter / local of the same names).
shz = ' '
url = ' '

# Headless Firefox session shared by the whole script.
op = webdriver.FirefoxOptions()
op.add_argument("--headless")  # run without a visible browser window
op.add_argument("--disable-gpu")  # disable GPU acceleration
# `options=` is the supported keyword; the older `firefox_options=` spelling
# was deprecated and is rejected by newer Selenium releases.
driver = webdriver.Firefox(options=op)
try:
    driver.get(address)
    # Block (up to 10 s, polling every 0.2 s) until the start page has loaded.
    ele = WebDriverWait(driver, 10, 0.2).until(ec.title_contains('百度汉语'))
except BaseException:
    # Do not leave an orphaned headless browser behind if start-up fails.
    driver.quit()
    raise
def getvalue(shz):
    """Search for ``shz`` and save its stroke animation, audio and pinyin.

    Uses the module-level ``driver``, which must currently show the start
    page. The function types ``shz`` into the element with id ``kw``, clicks
    the element with id ``su``, optionally follows the first link of an
    intermediate page (the log messages call this a "夹层"), and then
    scrapes the detail page.

    Side effects:
        * downloads ``./image/<shz>.gif`` from the ``src`` attribute of
          ``#word_bishun``;
        * downloads ``./mp3/<shz>.mp3`` from the ``url`` attribute of the
          link inside ``#pinyin``;
        * appends ``"<shz>,<pinyin>"`` as one line to ``pinyi.txt``;
        * navigates the browser back and waits for the start page title.

    Args:
        shz: Text to search for; also used as the base name of saved files.

    Raises:
        SystemExit: If the search form cannot be found or operated. This is
            treated as fatal because no further lookup could succeed.
        Exception: Errors raised by Selenium or ``urlretrieve`` while
            scraping the detail page propagate to the caller.
    """
    # Submit the search form.
    try:
        search_box = driver.find_element(By.ID, 'kw')
        search_box.clear()
        search_box.send_keys(shz)
        driver.find_element(By.ID, 'su').click()
        print(shz)
    except Exception as err:
        print('程序出现错误,请调试解决后运行')
        # Non-zero status so callers/shells can see that the run failed.
        raise SystemExit(1) from err

    # Some searches land on an intermediate page first; if so, follow its
    # first entry. Absence of that page is the normal case, not an error.
    try:
        driver.find_element(By.ID, 'pc--body')  # presence check only
        driver.find_element(
            By.XPATH, r'//*[@id="data-container"]/div[1]/div[1]/a').click()
        print('有夹层')
    except Exception:
        print('无夹层')

    # Pass a locator (not an already-found element) so the wait itself
    # retries the lookup until the detail page has rendered.
    WebDriverWait(driver, 10).until(
        ec.visibility_of_element_located((By.ID, 'pc-word-body')))

    # Make sure the output directories exist before downloading into them.
    os.makedirs('./image', exist_ok=True)
    os.makedirs('./mp3', exist_ok=True)

    # Stroke-order animation, saved as a GIF named after the search text.
    url = driver.find_element(
        By.XPATH, r'//*[@id="word_bishun"]').get_attribute('src')
    urlretrieve(url, './image/' + shz + '.gif')
    print(url)

    # Pronunciation audio, saved as an MP3 named after the search text.
    url = driver.find_element(
        By.XPATH, r'//*[@id="pinyin"]/span/a').get_attribute('url')
    urlretrieve(url, './mp3/' + shz + '.mp3')
    print(url)

    pinyin = driver.find_element(By.XPATH, r'//*[@id="pinyin"]/span/b').text
    print(pinyin)

    # Return to the start page so the next lookup finds the search form.
    driver.back()
    with open('pinyi.txt', 'a+', encoding='UTF-8-sig') as f:
        f.write(shz + ',' + pinyin + '\n')
    WebDriverWait(driver, 10, 0.2).until(ec.title_contains('百度汉语'))
    print('抓取成功')
# main: read the list of characters and scrape them one at a time.
# Each line of yourtxt.txt contributes its last non-whitespace character.
try:
    with open('yourtxt.txt', 'r', encoding='UTF-8-sig') as f:
        txt = f.readlines()
    for i, line in enumerate(txt):
        print(i + 1)
        entry = line.strip()
        if not entry:
            # Blank line: nothing to look up.
            continue
        # Strip first so the final line works even without a trailing
        # newline (indexing [-2] on the raw line picked the wrong character).
        d = entry[-1]
        try:
            getvalue(d)
        except Exception as e:
            # Best effort: report the failure and continue with the next one.
            print(e)
finally:
    # Always shut the browser down, including when getvalue() aborts the
    # run with SystemExit or the input file cannot be read. quit() closes
    # every window, so a separate close() is not needed.
    driver.quit()