"""Scrape the NeurIPS 2019 schedule page and list its Spotlight cards.

For every element whose ``class`` attribute starts with
``maincard narrower Spotlight`` the script reads the leading text of its
1st, 3rd and 5th child ``div`` (used here as type, title and author), then
writes one ``index, type, title, author`` line per card to
``nips_paperlist_spotlight.txt`` and echoes the same line to stdout.
"""
from lxml import html
import requests

SCHEDULE_URL = 'https://nips.cc/Conferences/2019/Schedule'
OUTPUT_PATH = 'nips_paperlist_spotlight.txt'

# Selects the Spotlight cards by class-attribute prefix.
# Replace 'Spotlight' with 'Oral' to list the oral presentations instead.
CARD_XPATH = "//*[@class[starts-with(., 'maincard narrower Spotlight') and string-length() > 3]]"

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def _child_text(card, path):
    """Return the leading text of the first node matching *path* under *card*.

    An empty string is returned when nothing matches or the node has no
    leading text, so a single malformed card neither aborts the run nor
    writes the literal string "None" into the output.
    """
    matches = card.xpath(path)
    if not matches or matches[0].text is None:
        return ""
    return matches[0].text


page = requests.get(SCHEDULE_URL, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty paper list.
page.raise_for_status()
tree = html.fromstring(page.content)
btags = tree.xpath(CARD_XPATH)

# Explicit UTF-8 so non-ASCII titles/author names do not depend on the
# platform's default encoding.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for idx, card in enumerate(btags, start=1):
        kind = _child_text(card, "div[1]")
        title = _child_text(card, "div[3]")
        author = _child_text(card, "div[5]")
        out_str = "%d, %s, %s, %s\n" % (idx, kind, title, author)
        print(out_str)
        f.write(out_str)
使用XPath
lxml, requests
https://docs.python-guide.org/scenarios/scrape/
https://stackoverflow.com/questions/12393858/xpath-using-contains-with-a-wildcard